├── information_retrieval ├── elastic_container │ ├── errors.jsonl │ ├── start_elasticsearch.sh │ ├── ingest_data.py │ └── elastic.ipynb ├── faiss_container │ ├── docker-compose.yml │ ├── Dockerfile │ ├── server.py │ └── faiss_insert_data.ipynb └── document_encoding │ ├── bioBERT_encoder.py │ ├── medCPT_encoder.py │ └── encode_documents.ipynb ├── sys_requirements.txt ├── LICENSE ├── evaluation ├── evaluation_data_storages │ ├── faiss │ │ ├── conncatinatior.py │ │ ├── embedding_extractor.py │ │ └── request.ipynb │ ├── documentation.md │ ├── mongodb │ │ └── eval_mongo.ipynb │ └── elasticsearch │ │ ├── elastic.ipynb │ │ └── eval_elastic.ipynb └── evaluation_QA_system │ ├── full_text_evaluation.py │ ├── evaluation_pipeline.ipynb │ └── RAG_evaluator.py ├── rag_system ├── bm25_retriever.py ├── bioBERT_encoder.py ├── med_rag.py ├── hybrid_retriever.py ├── bioBERT_retriever.py ├── medCPT_encoder.py ├── openAI_chat.py └── medCPT_retriever.py ├── requirements.txt ├── .gitignore └── README.md /information_retrieval/elastic_container/errors.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /information_retrieval/faiss_container/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | faiss-app: 3 | image: continuumio/anaconda3 4 | ports: 5 | - "5000:5000" 6 | volumes: 7 | - ./server.py:/app/server.py 8 | - ./faiss_indices:/app/faiss_indices 9 | - ./PMIDs:/app/PMIDs 10 | working_dir: /app 11 | environment: 12 | - FLASK_APP=server.py 13 | - FLASK_RUN_HOST=0.0.0.0 14 | command: > 15 | /bin/bash -c "conda install -c pytorch faiss-cpu -y && 16 | pip install flask numpy pandas && 17 | python server.py" -------------------------------------------------------------------------------- /sys_requirements.txt: -------------------------------------------------------------------------------- 1 | # Essential system packages for Ubuntu 22.04.4 LTS (Jammy) 2 | apt # Advanced Package Tool, a package management system for Debian 3 | curl # Command line tool for transferring data with URLs 4 | gcc # GNU Compiler Collection, a compiler system 5 | g++ # GNU C++ Compiler 6 | make # Utility for directing compilation 7 | python3 # Python programming language interpreter 8 | python3-pip # Package installer for Python 9 | git # Version control system 10 | docker-ce # Docker: the open-source application container engine 11 | docker-compose-plugin # Docker Compose (V2) plugin for the Docker CLI 12 | build-essential # Informational list of build-essential packages 13 | -------------------------------------------------------------------------------- /information_retrieval/faiss_container/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the Anaconda base image 2 | FROM continuumio/anaconda3 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Install necessary packages using Conda and Pip 8 | RUN conda install -c pytorch faiss-cpu -y && \ 9 | pip install flask numpy pandas 10 | 11 | # Copy the application files into the container 12 | COPY ./server.py /app/server.py 13 | COPY ./faiss_indices /app/faiss_indices 14 | COPY ./PMIDs /app/PMIDs 15 | 16 | # Set environment variables 17 | ENV FLASK_APP=/app/server.py 18 | ENV FLASK_RUN_HOST=0.0.0.0 19 | ENV FLASK_RUN_PORT=5000 20 | 21 | # Expose port 5000 for communication with the Flask app 22 | EXPOSE 5000 23 | 24 | # Define 
the command that runs when the container starts 25 | CMD ["flask", "run"] 26 | 27 | # docker run -d --name faiss_cpt -p 5000:5000 faiss:latest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Linus Stuhlmann & Michael Saxer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/faiss/conncatinatior.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | import pandas as pd 5 | 6 | def concatenate_pubmed_ids(input_dir: Path, output_dir: Path) -> None: 7 | # Sicherstellen, dass das Ausgabeverzeichnis vorhanden ist 8 | output_dir.mkdir(exist_ok=True) 9 | 10 | # Laden und Konkatenieren der PubMed ID Arrays 11 | final_pubmed_ids = np.array([], dtype=int) 12 | 13 | id_files = sorted(input_dir.glob('pubmed_ids_*.npy')) 14 | 15 | if not id_files: 16 | print("Keine PubMed ID .npy Dateien gefunden.") 17 | return 18 | 19 | for file in tqdm(id_files, desc="Lade und konkatiniere PubMed IDs"): 20 | ids = np.load(file) 21 | final_pubmed_ids = np.concatenate((final_pubmed_ids, ids)) 22 | 23 | # Speichern der finalen PubMed IDs 24 | pd.DataFrame(final_pubmed_ids).to_csv(output_dir / 'concatenated_pubmed_ids.csv', index=False, header=False) 25 | print("Finale PubMed IDs gespeichert.") 26 | 27 | if __name__ == "__main__": 28 | input_dir = Path('/home/ubuntu/data/numpy_embeddings') 29 | output_dir = Path('/home/ubuntu/stuhllin/medical_RAG_system/information_retrieval/faiss_container/PMIDs') 30 | concatenate_pubmed_ids(input_dir, output_dir) 31 | -------------------------------------------------------------------------------- /information_retrieval/document_encoding/bioBERT_encoder.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, models 2 | import torch 3 | 4 | class bioBERTEncoder: 5 | def __init__(self, max_length=512): 6 | if torch.cuda.is_available(): 7 | self.device = "cuda" 8 | else: 9 | self.device = "cpu" 10 | 11 | self.max_length = max_length 12 | 13 | word_embedding_model = models.Transformer('dmis-lab/biobert-v1.1', max_seq_length=self.max_length) 14 | pooling_model = 
models.Pooling(word_embedding_model.get_word_embedding_dimension(), 15 | pooling_mode_mean_tokens=True, 16 | pooling_mode_cls_token=False, 17 | pooling_mode_max_tokens=False) 18 | 19 | self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device) 20 | 21 | def __call__(self, batch): 22 | contents = [item["content"] for item in batch] 23 | embeddings = self.model.encode(contents, batch_size=len(contents), show_progress_bar=False) 24 | return [{"id": item["id"], "title": item["title"], "content": item["content"], "PMID": item.get("PMID", None), "embeddings": embedding.tolist()} for item, embedding in zip(batch, embeddings)] 25 | -------------------------------------------------------------------------------- /information_retrieval/elastic_container/start_elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check whether the Docker network exists 4 | if ! docker network ls | grep -qw elastic; then 5 | echo "Network 'elastic' does not exist. Creating it..." 6 | docker network create elastic 7 | else 8 | echo "Network 'elastic' already exists." 9 | fi 10 | 11 | # Check whether the Docker volume exists 12 | if ! docker volume ls | grep -qw elasticsearch_data; then 13 | echo "Volume 'elasticsearch_data' does not exist. Creating it..." 14 | docker volume create elasticsearch_data 15 | else 16 | echo "Volume 'elasticsearch_data' already exists." 17 | fi 18 | 19 | docker pull docker.elastic.co/elasticsearch/elasticsearch:8.13.4 20 | 21 | # Start the Elasticsearch container 22 | echo "Starting Elasticsearch container..." 23 | docker run \ 24 | --name es01 \ 25 | --net elastic \ 26 | -p 9200:9200 \ 27 | -it \ 28 | -m 32GB \ 29 | --volume elasticsearch_data:/usr/share/elasticsearch/data \ 30 | -e "ES_JAVA_OPTS=-Xms16g -Xmx16g" \ 31 | docker.elastic.co/elasticsearch/elasticsearch:8.13.4 32 | 33 | # Set a 16 GB heap (Xms and Xmx) to avoid OutOfMemoryError 34 | 35 | echo "Elasticsearch container started."
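# Note: the Python clients in this repository (ingest_data.py and the rag_system retrievers) verify TLS
# against a local copy of the cluster's CA certificate (http_ca.crt) and read the password from
# ELASTIC_PASSWORD. Assuming the default layout of the official Elasticsearch 8.x image, both can be
# obtained roughly as follows (illustrative sketch; adjust the target path to wherever ca_certs points):
#
#   docker cp es01:/usr/share/elasticsearch/config/certs/http_ca.crt ~/.crt/http_ca.crt
#   docker exec -it es01 /usr/share/elasticsearch/bin/elasticsearch-reset-password -u elastic
#   export ELASTIC_PASSWORD='<password printed by the command above>'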
36 | 37 | 38 | # if crt problem, use this command to start the container 39 | # docker run --name es01 --net elastic -p 9200:9200 -it -m 32GB -e "ES_JAVA_OPTS=-Xms16g -Xmx16g" docker.elastic.co/elasticsearch/elasticsearch:8.13.4 -------------------------------------------------------------------------------- /information_retrieval/faiss_container/server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | import faiss 3 | import numpy as np 4 | import pandas as pd 5 | 6 | app = Flask(__name__) 7 | 8 | # Load the Faiss index (assuming the index has already been created and saved) 9 | index_path = "/app/faiss_indices/bioBERT_index.index" 10 | index = faiss.read_index(index_path) 11 | 12 | # Load PMIDs and their respective index numbers 13 | pmids_path = "/app/PMIDs/bioBERT_pmids.csv" 14 | pmids_df = pd.read_csv(pmids_path) 15 | 16 | # Create a dictionary to map index numbers to PMIDs 17 | index_to_pmids = dict(zip(pmids_df['Index'], pmids_df['PMID'])) 18 | 19 | @app.route('/search', methods=['POST']) 20 | def search(): 21 | # Extract the query vectors and the value of k from the POST request 22 | data = request.get_json() 23 | queries = np.array(data['queries'], dtype='float32') 24 | 25 | # Get the number of nearest neighbors to search for 26 | k = int(data['k']) 27 | 28 | # Perform the search in the Faiss index 29 | distances, indices = index.search(queries, k) 30 | 31 | # Map the Faiss indices to PMIDs using the dictionary 32 | matched_PMIDs = [[index_to_pmids[idx] for idx in row] for row in indices] 33 | 34 | # Return the response as JSON 35 | return jsonify(PMIDs=matched_PMIDs, distances=distances.tolist()) 36 | 37 | if __name__ == '__main__': 38 | app.run(host='0.0.0.0', port=5000) # Accessible over port 5000 on all network interfaces -------------------------------------------------------------------------------- /rag_system/bm25_retriever.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import json 4 | 5 | class BM25Retriever: 6 | def __init__(self): 7 | elastic_password = os.getenv('ELASTIC_PASSWORD') 8 | self.es = Elasticsearch( 9 | ['https://localhost:9200'], 10 | basic_auth=('elastic', elastic_password), 11 | verify_certs=True, 12 | ca_certs="/home/rag/.crt/http_ca.crt", 13 | request_timeout=60 14 | ) 15 | self.index = "pubmed_index" 16 | 17 | def retrieve_docs(self, query: str, k: int = 10): 18 | es_query = { 19 | "size": k, 20 | "query": { 21 | "match": { 22 | "content": query 23 | } 24 | }, 25 | "_source": ["PMID", "title", "content"] 26 | } 27 | # Execute the search query 28 | response = self.es.search(index=self.index, body=es_query) 29 | 30 | # Format the results into the desired JSON structure 31 | results = {} 32 | for idx, doc in enumerate(response['hits']['hits'], 1): 33 | doc_key = f"doc{idx}" 34 | results[doc_key] = { 35 | 'PMID': doc['_source']['PMID'], 36 | 'title': doc['_source']['title'], 37 | 'content': doc['_source']['content'], 38 | 'score': doc['_score'] 39 | } 40 | 41 | return json.dumps(results, indent=4) -------------------------------------------------------------------------------- /rag_system/bioBERT_encoder.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, models 2 | import torch 3 | import time 4 | 5 | 6 | class BioBERTQueryEncooder: 7 | def __init__(self, 
model_name='dmis-lab/biobert-v1.1', max_length=512): 8 | if torch.cuda.is_available(): 9 | self.device = "cuda" 10 | else: 11 | self.device = "cpu" 12 | 13 | self.max_length = max_length 14 | 15 | # Load the pretrained BioBERT model and add a mean-pooling layer 16 | word_embedding_model = models.Transformer(model_name, max_seq_length=self.max_length) 17 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 18 | pooling_mode_mean_tokens=True, 19 | pooling_mode_cls_token=False, 20 | pooling_mode_max_tokens=False) 21 | 22 | self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device) 23 | 24 | def encode(self, text): 25 | # Transform the text into its vector representation 26 | embedding = self.model.encode([text], batch_size=1, show_progress_bar=False) 27 | return embedding[0] 28 | 29 | if __name__ == "__main__": 30 | 31 | embedder = BioBERTQueryEncooder() 32 | text = "This is a test sentence." 33 | start = time.time() 34 | embedding = embedder.encode(text) 35 | print(embedding) 36 | print(time.time() - start) -------------------------------------------------------------------------------- /information_retrieval/document_encoding/medCPT_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModel 3 | from typing import List, Dict 4 | 5 | class medCPTArticleEncoder: 6 | def __init__(self, max_length=512): 7 | if torch.cuda.is_available(): 8 | self.device = "cuda" 9 | else: 10 | self.device = "cpu" 11 | 12 | self.max_length = max_length 13 | self.model = AutoModel.from_pretrained("ncbi/MedCPT-Article-Encoder") 14 | self.tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Article-Encoder") 15 | 16 | def __call__(self, batch: List[Dict]) -> List[Dict]: 17 | encoded_articles = [] 18 | 19 | with torch.no_grad(): 20 | # Extract the article content from the batch 21 | articles = [item["content"] for item in batch] 22 | 23 | # Tokenize the articles 24 | encoded = self.tokenizer( 25 | articles, 26 | truncation=True, 27 | padding=True, 28 | return_tensors='pt', 29 | max_length=self.max_length, 30 | ) 31 | 32 | # Encode the articles (use the [CLS] token as the representation) 33 | outputs = self.model(**encoded) 34 | embeddings = outputs.last_hidden_state[:, 0, :] 35 | 36 | for i, item in enumerate(batch): 37 | encoded_articles.append({ 38 | "id": item["id"], 39 | "title": item["title"], 40 | "content": item["content"], 41 | "PMID": item.get("PMID", None), 42 | "embedding": embeddings[i].tolist() 43 | }) 44 | 45 | return encoded_articles 46 | -------------------------------------------------------------------------------- /rag_system/med_rag.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from openAI_chat import Chat 4 | from bioBERT_retriever import BioBERTRetriever 5 | from bm25_retriever import BM25Retriever 6 | from hybrid_retriever import HybridRetriever 7 | from medCPT_retriever import MedCPTRetriever 8 | 9 | class MedRAG: 10 | def __init__(self, retriever=1, question_type=1, n_docs=10): 11 | if retriever == 1: 12 | self.retriever = BioBERTRetriever() 13 | elif retriever == 2: 14 | self.retriever = BM25Retriever() 15 | elif retriever == 3: 16 | self.retriever = HybridRetriever() 17 | elif retriever == 4: 18 | self.retriever = MedCPTRetriever(rerank=True) 19 | else: 20 | raise ValueError("Invalid retriever value. Choose 1 for bioBERT, 2 for BM25, 3 for hybrid, or 4 for MedCPT.")
21 | 22 | self.chat = Chat(question_type=question_type) # 1 for full text, 2 for yes/no, 3 for multiple choice, 4 for keywords 23 | self.n_docs = n_docs 24 | 25 | def extract_pmids(self, docs): 26 | # Extracts PMIDs from the documents and returns them as a list 27 | return [doc["PMID"] for doc in docs.values()] 28 | 29 | def get_answer(self, question: str) -> str: 30 | 31 | # retrieve the documents, timing the retrieval 32 | start_time_retrieval = time.time() 33 | retrieved_docs = json.loads(self.retriever.retrieve_docs(question, self.n_docs)) 34 | end_time_retrieval = time.time() 35 | 36 | # extract the PMIDs from the retrieved documents 37 | pmids = self.extract_pmids(retrieved_docs) 38 | 39 | # the chat response is a json string {'response': '...', 'used_PMIDs': [...]}; timing the generation 40 | start_time_generation = time.time() 41 | answer = self.chat.create_chat(question, retrieved_docs) 42 | end_time_generation = time.time() 43 | 44 | retrieval_time = end_time_retrieval - start_time_retrieval 45 | generation_time = end_time_generation - start_time_generation 46 | 47 | # now add the retrieved PMIDs and timings to the response 48 | try: 49 | answer = json.loads(answer) 50 | answer['retrieved_PMIDs'] = pmids 51 | answer['retrieval_time'] = retrieval_time 52 | answer['generation_time'] = generation_time 53 | except: 54 | return None 55 | return json.dumps(answer) -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/faiss/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import gc # import the garbage collector 4 | from pathlib import Path 5 | from tqdm import tqdm 6 | 7 | def process_files(files): 8 | pubmed_ids = [] 9 | embeddings = [] 10 | for file_name in tqdm(files, desc="Processing files", leave=False): 11 | with open(file_name, 'r') as file: 12 | for line in file: 13 | try: 14 | data = json.loads(line) 15 | pubmed_ids.append(int(data.get('PMID', 0))) # convert to integer, defaulting to 0 16 | embeddings.append(data.get('embeddings', [])) # default to an empty list 17 | except json.JSONDecodeError as e: 18 | print(f"Error decoding JSON in file {file_name}: {e}") 19 | return pubmed_ids, embeddings 20 | 21 | source_directory = Path('/home/ubuntu/pubmed') 22 | jsonl_files = list(source_directory.glob('*.jsonl')) 23 | batch_size = 15 # maximum number of files per batch 24 | 25 | # Paths for intermediate files 26 | temp_dir = Path('/home/ubuntu/temp_pubmed') 27 | temp_dir.mkdir(exist_ok=True) # make sure the directory exists 28 | 29 | # Process the files in batches and save intermediate results 30 | for i in tqdm(range(0, len(jsonl_files), batch_size), desc="Processing batches"): 31 | batch_files = jsonl_files[i:i + batch_size] 32 | batch_pubmed_ids, batch_embeddings = process_files(batch_files) 33 | # Save the batch data to temporary files 34 | np.save(temp_dir / f'embeddings_{i // batch_size}.npy', batch_embeddings) 35 | np.save(temp_dir / f'pubmed_ids_{i // batch_size}.npy', batch_pubmed_ids) 36 | # Delete the lists to free memory 37 | del batch_pubmed_ids, batch_embeddings 38 | gc.collect() # explicitly trigger garbage collection 39 | 40 | # Load all intermediate files and concatenate the arrays 41 | final_embeddings = np.concatenate([np.load(file) for file in temp_dir.glob('embeddings_*.npy')]) 42 | final_pubmed_ids = np.concatenate([np.load(file) for file in temp_dir.glob('pubmed_ids_*.npy')])
43 | 44 | # Save the final arrays 45 | np.save('embeddings.npy', final_embeddings) 46 | np.save('pubmed_ids.npy', final_pubmed_ids) 47 | 48 | # Clean up: delete the temporary files 49 | for file in temp_dir.glob('*.npy'): 50 | file.unlink() 51 | temp_dir.rmdir() # remove the directory if it is empty 52 | -------------------------------------------------------------------------------- /rag_system/hybrid_retriever.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import json 4 | from medCPT_encoder import MedCPTCrossEncoder 5 | 6 | class HybridRetriever: 7 | def __init__(self): 8 | elastic_password = os.getenv('ELASTIC_PASSWORD') 9 | self.es = Elasticsearch( 10 | ['https://localhost:9200'], 11 | basic_auth=('elastic', elastic_password), 12 | verify_certs=True, 13 | ca_certs="/home/rag/.crt/http_ca.crt", 14 | request_timeout=60 15 | ) 16 | self.index = "pubmed_index" 17 | self.reranker = MedCPTCrossEncoder() 18 | 19 | def rerank_docs(self, query: str, docs: list): 20 | """Reranks the documents based on their relevance to the query.""" 21 | scores = self.reranker.score([doc['content'] for doc in docs], query) 22 | reranked_docs = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True) 23 | return reranked_docs 24 | 25 | def retrieve_docs(self, query: str, top_n: int = 10, k: int = 20): 26 | """Retrieves documents from Elasticsearch and reranks them, returning only the top N results.""" 27 | es_query = { 28 | "size": k, 29 | "query": { 30 | "match": { 31 | "content": query 32 | } 33 | }, 34 | "_source": ["PMID", "title", "content"] 35 | } 36 | # Execute the search query 37 | response = self.es.search(index=self.index, body=es_query) 38 | 39 | # Extract documents with full metadata 40 | docs = [{ 41 | 'PMID': hit['_source']['PMID'], 42 | 'title': hit['_source']['title'], 43 | 'content': hit['_source']['content'] 44 | } for hit in response['hits']['hits']] 45 | 46 | # Rerank the documents 47 | reranked_docs = self.rerank_docs(query, docs) 48 | 49 | # only take documents with a score > 0 50 | reranked_docs = [doc for doc in reranked_docs if doc[1] > 0] 51 | 52 | # Take at most the top N reranked documents 53 | top_reranked_docs = reranked_docs[:top_n] 54 | 55 | # Construct the final results with reranked scores 56 | results = { 57 | f"doc{idx + 1}": { 58 | 'PMID': doc['PMID'], 59 | 'title': doc['title'], 60 | 'content': doc['content'], 61 | 'score': score.item() 62 | } 63 | for idx, (doc, score) in enumerate(top_reranked_docs) 64 | } 65 | 66 | return json.dumps(results, indent=4) 67 | 68 | if __name__ == "__main__": 69 | retriever = HybridRetriever() 70 | query = "Is Alzheimer's disease hereditary?"
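# k controls how many BM25 candidates are pulled from Elasticsearch before cross-encoder
# reranking; top_n caps how many reranked documents are returned to the caller.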
71 | results = retriever.retrieve_docs(query, k=100, top_n=10) 72 | print(results) 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anaconda==0.0.1.1 # Anaconda package 2 | annotated-types==0.6.0 # Support for typing-annotations 3 | anyio==4.3.0 # Async network and file operations 4 | argon2-cffi==23.1.0 # The secure Argon2 password hashing algorithm 5 | attrs==23.2.0 # Attributes without boilerplate 6 | Babel==2.14.0 # Internationalization utilities 7 | beautifulsoup4==4.12.3 # Screen-scraping library 8 | bleach==6.1.0 # Sanitize your inputs 9 | click==8.1.7 # Command Line Interface Creation Kit 10 | decorator==5.1.1 # Simplifies the usage of decorators 11 | elastic-transport==8.13.0 # Transport layer for Elasticsearch 12 | elasticsearch==8.13.0 # Official Elasticsearch client 13 | faiss-cpu==1.8.0 # A library for efficient similarity search and clustering 14 | Flask==3.0.3 # Micro web framework 15 | fsspec==2024.3.1 # File system specification 16 | huggingface-hub==0.22.2 # Client library for Huggingface hub 17 | idna==3.3 # Internationalized Domain Names in Applications (IDNA) 18 | importlib-metadata==4.6.4 # Library to access the metadata for a Python package 19 | joblib==1.4.0 # Lightweight pipelining in Python 20 | jsonschema==4.21.1 # JSON Schema validation for Python 21 | jupyter==1.0.0 # Jupyter metapackage 22 | jupyterlab==4.1.8 # JupyterLab: the next generation Jupyter notebook 23 | MarkupSafe==2.1.5 # Implements a XML/HTML/XHTML Markup safe string for Python 24 | matplotlib==3.8.4 # Plotting library for Python 25 | more-itertools==8.10.0 # More routines for operating on iterables, beyond itertools 26 | nbconvert==7.16.3 # Convert Jupyter Notebooks 27 | numpy==1.26.4 # Fundamental package for array computing 28 | openai==1.23.3 # OpenAI API client 29 | packaging==24.0 # Core utilities for Python packages 30 | pandas==2.2.2 # Data analysis and manipulation library 31 | prompt-toolkit==3.0.43 # Library for building powerful interactive command lines 32 | psutil==5.9.8 # Cross-platform process and system utilities 33 | pydantic==2.7.1 # Data validation and settings management using Python type annotations 34 | Pygments==2.17.2 # Syntax highlighting package 35 | regex==2024.4.16 # Alternative regular expression module 36 | requests==2.31.0 # Simple HTTP library for Python 37 | scikit-learn==1.4.2 # Machine learning library 38 | scipy==1.13.0 # Fundamental library for scientific computing 39 | sentence-transformers==2.7.0 # BERT and SentenceTransformers library 40 | six==1.16.0 # Python 2 and 3 compatibility utilities 41 | tqdm==4.66.2 # Fast, extensible progress bar for Python 42 | traitlets==5.14.2 # Configuration system for Python applications 43 | transformers==4.40.0 # State-of-the-art Natural Language Processing for TensorFlow and PyTorch 44 | torch==2.2.2 # Tensors and Dynamic neural networks in Python with strong GPU acceleration 45 | urllib3==1.26.5 # HTTP library with thread-safe connection pooling 46 | Werkzeug==3.0.2 # Comprehensive WSGI web application library 47 | -------------------------------------------------------------------------------- /rag_system/bioBERT_retriever.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import requests 4 | import json 5 | from bioBERT_encoder import BioBERTQueryEncooder 6 | 7 | class BioBERTRetriever: 
8 | def __init__(self): 9 | elastic_password = os.getenv('ELASTIC_PASSWORD') 10 | self.es = Elasticsearch( 11 | ['https://localhost:9200'], 12 | basic_auth=('elastic', elastic_password), 13 | verify_certs=True, 14 | ca_certs="/home/ubuntu/.crts/http_ca.crt", 15 | request_timeout=60 16 | ) 17 | self.index = "pubmed_index" 18 | self.faiss_url = "http://localhost:5000/search" 19 | self.query_encoder = BioBERTQueryEncooder() 20 | 21 | def query_to_vector(self, text: str): 22 | """Converts text query to a vector using the BioBERT encoder.""" 23 | embedding = self.query_encoder.encode(text) 24 | return embedding 25 | 26 | def faiss_query(self, query: str, k: int = 10): 27 | """Performs a vector search using FAISS with the given query and k.""" 28 | vec = self.query_to_vector(query).tolist() # Convert numpy array to list 29 | data = { 30 | 'queries': [vec], # List of vectors 31 | 'k': k 32 | } 33 | response = requests.post(self.faiss_url, headers={'Content-Type': 'application/json'}, data=json.dumps(data)) 34 | return response.json() 35 | 36 | def get_docs_via_PMIDs(self, PMIDs: list): 37 | """Retrieves documents from Elasticsearch using a list of PMIDs.""" 38 | query = { 39 | "size": len(PMIDs), 40 | "query": { 41 | "terms": { 42 | "PMID": PMIDs 43 | } 44 | }, 45 | "_source": ["PMID", "title", "content"] 46 | } 47 | return self.es.search(index=self.index, body=query) 48 | 49 | def retrieve_docs(self, query: str, k: int = 10): 50 | """Retrieves documents relevant to the query using both FAISS and Elasticsearch.""" 51 | response = self.faiss_query(query, k) 52 | PMIDs = response['PMIDs'][0] # Assumes PMIDs are returned in a structured list 53 | es_response = self.get_docs_via_PMIDs(PMIDs) 54 | results = {} 55 | 56 | # Formatting the response as required 57 | for idx, hit in enumerate(es_response['hits']['hits'], 1): 58 | doc_key = f"doc{idx}" 59 | results[doc_key] = { 60 | 'PMID': hit['_source']['PMID'], 61 | 'title': hit['_source']['title'], 62 | 'content': hit['_source']['content'] 63 | } 64 | 65 | return json.dumps(results, indent=4) 66 | 67 | 68 | if __name__ == '__main__': 69 | retriever = BioBERTRetriever() 70 | query = "What is the role of sdRNA in cancer?" 
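# This demo assumes the FAISS Flask service from information_retrieval/faiss_container is
# reachable at http://localhost:5000/search and that the Elasticsearch index is running.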
71 | n_docs = 5 72 | response = retriever.retrieve_docs(query, n_docs) 73 | print(response) -------------------------------------------------------------------------------- /rag_system/medCPT_encoder.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification 2 | import torch 3 | 4 | 5 | class MedCPTQueryEncoder: 6 | def __init__(self, model_name='ncbi/MedCPT-Query-Encoder', max_length=512): 7 | if torch.cuda.is_available(): 8 | self.device = "cuda" 9 | else: 10 | self.device = "cpu" 11 | 12 | self.max_length = max_length 13 | 14 | # Load pretrained MedCPT-Query-Encoder model and tokenizer 15 | self.model = AutoModel.from_pretrained(model_name).to(self.device) 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 17 | 18 | def encode(self, text): 19 | with torch.no_grad(): 20 | # Tokenize the text 21 | inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_length).to(self.device) 22 | # Pass the inputs through the model 23 | outputs = self.model(**inputs) 24 | # Return the last hidden states 25 | return outputs.last_hidden_state[:, 0, :] 26 | 27 | 28 | class MedCPTCrossEncoder: 29 | def __init__(self, model_name='ncbi/MedCPT-Cross-Encoder'): 30 | if torch.cuda.is_available(): 31 | self.device = "cuda" 32 | else: 33 | self.device = "cpu" 34 | 35 | # Load pretrained Cross-Encoder model and tokenizer 36 | self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device) 37 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 38 | 39 | def score(self, articles, query): 40 | pairs = [[query, article] for article in articles] 41 | 42 | with torch.no_grad(): 43 | encoded = self.tokenizer( 44 | pairs, 45 | truncation=True, 46 | padding=True, 47 | return_tensors="pt", 48 | max_length=512, 49 | ).to(self.device) 50 | 51 | logits = self.model(**encoded).logits.squeeze(dim=1) 52 | return logits 53 | 54 | 55 | if __name__ == "__main__": 56 | 57 | cross_encoder = MedCPTCrossEncoder() 58 | 59 | query = "What is the treatment for diabetes?" 60 | 61 | articles = [ 62 | "Diabetes is a chronic disease that occurs when the body is unable to produce enough insulin or use it effectively. Treatment for diabetes includes lifestyle changes, such as diet and exercise, as well as medications like insulin and oral hypoglycemic drugs.", 63 | "The treatment for diabetes involves managing blood sugar levels through diet, exercise, and medication. Insulin therapy, oral medications, and lifestyle changes are common approaches to managing diabetes.", 64 | "Diabetes treatment typically involves a combination of diet, exercise, and medication. 
Insulin therapy, oral medications, and lifestyle changes are key components of managing diabetes.", 65 | ] 66 | 67 | scores = cross_encoder.score(articles, query) 68 | 69 | for i, (article, score) in enumerate(zip(articles, scores)): 70 | print(f"Article {i+1}: {article}") 71 | print(f"Score: {score:.4f}\n") 72 | -------------------------------------------------------------------------------- /evaluation/evaluation_QA_system/full_text_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | 4 | 5 | class evaluateResponseGPT: 6 | def __init__(self, response, answer): 7 | self.response = response 8 | self.correct_answer = answer 9 | self.model = "gpt-3.5-turbo" 10 | api_key = os.getenv('OPENAI_API_KEY') 11 | self.client = openai.OpenAI(api_key=api_key) 12 | self.context = self.set_context() 13 | 14 | def set_context(self) -> str: 15 | return ( 16 | "You will evaluate a response by comparing it to an expert's optimal answer in the biomedical domain. " 17 | "The evaluation process should include the following steps: " 18 | "1. Identification of key terms and concepts in both the provided response and the expert's optimal answer. " 19 | "2. Assessment of the context of the used terms and concepts in the response and the expert's answer. " 20 | "3. Determination of the accuracy and completeness of the provided response. " 21 | "Score the response on a scale from 0 to 10, where 0 means no overlap at all with the expert's answer and 10 means a perfect match. " 22 | "Provide the discrete numerical score as your response." 23 | ) 24 | 25 | def set_initial_message(self): 26 | return [{"role": "system", "content": self.context}] 27 | 28 | def get_evaluation(self) -> float: 29 | messages = self.set_initial_message() 30 | messages.append({"role": "user", "content": f"Response: {self.response}"}) 31 | print(f"Response: {self.response}") 32 | messages.append({"role": "user", "content": f"Correct answer: {self.correct_answer}. Please score the response above from 0 to 10 based on its accuracy and completeness."}) 33 | print(f"Correct answer: {self.correct_answer}") 34 | try: 35 | completion = self.client.chat.completions.create( 36 | model=self.model, 37 | messages=messages, 38 | max_tokens=500, 39 | temperature=0.0 40 | ) 41 | 42 | # Correct way to access the message content 43 | response_content = completion.choices[0].message.content # Removed incorrect dictionary access 44 | try: 45 | score = float(response_content.strip()) 46 | except ValueError: 47 | score = 0 # Handle the case where the response cannot be converted to float 48 | except Exception as e: 49 | print(f"An error occurred during response evaluation: {e}") 50 | score = 0 51 | 52 | return score/10 # Normalize the score to be between 0 and 1 53 | 54 | 55 | if __name__ == "__main__": 56 | response = "Standard treatment for type 2 diabetes is insulin injections and does not emphasize lifestyle changes or oral medications like metformin." 57 | correct_answer = "The standard treatment for type 2 diabetes involves lifestyle modifications such as diet and exercise, complemented by medications like metformin to regulate blood sugar levels."
58 | evaluator = evaluateResponseGPT(response, correct_answer) 59 | score = evaluator.get_evaluation() 60 | print(f"Response score: {score}") -------------------------------------------------------------------------------- /rag_system/openAI_chat.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | import json 4 | from typing import List, Dict 5 | 6 | class Chat: 7 | def __init__(self, question_type: int = 1, api_key: str = os.getenv('OPENAI_API_KEY'), model: str = "gpt-3.5-turbo"): 8 | self.api_key = api_key 9 | self.model = model 10 | self.client = openai.OpenAI(api_key=self.api_key) 11 | self.context = self.set_context(question_type) 12 | 13 | def set_context(self, question_type: int) -> str: 14 | base_context = ( 15 | "You are a scientific medical assistant designed to synthesize responses " 16 | "from specific medical documents. Only use the information provided in the " 17 | "documents to answer questions. The first documents should be the most relevant. " 18 | "Do not use any other information except for the documents provided. " 19 | "When answering questions, always format your response " 20 | "as a JSON object with the fields 'response' and 'used_PMIDs'. " 21 | "Cite all PMIDs your response is based on in the 'used_PMIDs' field. " 22 | "Please think step-by-step before answering questions and provide the most accurate response possible." 23 | ) 24 | 25 | question_specific_context = { 26 | 1: " Provide a detailed answer to the question in the 'response' field.", 27 | 2: " Your response should only be 'yes' or 'no'. If no relevant documents are found, return 'no_docs_found'.", 28 | 3: " Choose between the given options 1 to 4 and return as 'response' the chosen number. If no relevant documents are found, return the number 5.", 29 | 4: " Respond with keywords and list each keyword separately as a list element. For example ['keyword1', 'keyword2', 'keyword3']. 
If no relevant documents are found, return an empty list.", 30 | } 31 | 32 | return base_context + question_specific_context.get(question_type, "") 33 | 34 | def set_initial_message(self) -> List[dict]: 35 | return [{"role": "system", "content": self.context}] 36 | 37 | def create_chat(self, user_message: str, retrieved_documents: Dict) -> str: 38 | messages = self.set_initial_message() 39 | messages.append({"role": "user", "content": f"Answer the following question: {user_message}"}) 40 | 41 | document_texts = ["PMID {}: {} {}".format(doc['PMID'], doc['title'], doc['content']) for doc in retrieved_documents.values()] 42 | documents_message = "\n\n".join(document_texts) # Separating documents with two newlines 43 | messages.append({"role": "system", "content": documents_message}) 44 | 45 | try: 46 | completion = self.client.chat.completions.create( 47 | model=self.model, 48 | messages=messages, 49 | max_tokens=500, 50 | temperature=0.0 51 | ) 52 | 53 | response_content = completion.choices[0].message.content 54 | try: 55 | response_data = json.loads(response_content) 56 | formatted_response = { 57 | "response": response_data.get("response"), 58 | "used_PMIDs": response_data.get("used_PMIDs", []), 59 | "retrieved_PMIDs": [doc['PMID'] for doc in retrieved_documents.values()] 60 | } 61 | return json.dumps(formatted_response) 62 | except json.JSONDecodeError: 63 | return json.dumps({"error": "Invalid JSON format in response.", "response": response_content}) 64 | 65 | except Exception as e: 66 | return json.dumps({"error": str(e)}) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | data/ 163 | 164 | information_retrieval/faiss_container/faiss_indices 165 | 166 | information_retrieval/faiss_container/PMIDs -------------------------------------------------------------------------------- /information_retrieval/elastic_container/ingest_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import os 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | from elasticsearch import Elasticsearch, helpers 8 | 9 | password = os.getenv("ELASTIC_PASSWORD") 10 | 11 | es = Elasticsearch( 12 | hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}], 13 | ca_certs="/home/rag/.crt/http_ca.crt", 14 | basic_auth=("elastic", password), 15 | ) 16 | 17 | # Define the index name 18 | index_name = "pubmed_index" 19 | 20 | # Delete the index if it exists 21 | if es.indices.exists(index=index_name): 22 | es.indices.delete(index=index_name) 23 | 24 | # Check again if the index exists, and if not, create it 25 | if not es.indices.exists(index=index_name): 26 | # Define the mapping 27 | mapping = { 28 | "settings": { 29 | "analysis": { 30 | "analyzer": { 31 | "default": { 32 | "type": "standard", 33 | "stopwords": "_english_" 34 | } 35 | } 36 | } 37 | }, 38 | "mappings": { 39 | "properties": { 40 | "content": { 41 | "type": "text", 42 | "analyzer": "default", 43 | "fields": { 44 | "keyword": { 45 | "type": "keyword", 46 | "ignore_above": 256 47 | } 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | 55 | # Create the index with the defined mapping 56 | es.indices.create(index=index_name, body=mapping) 57 | 58 | source_directory = Path('/home/rag/data/chunk') 59 | error_log_path = Path('./errors.jsonl') # Pfad zur Fehlerprotokolldatei 60 | 61 | def bulk_index_documents(source_directory, index_name, error_log_path): 62 | if not source_directory.exists(): 63 | print("The source directory does not exist.") 64 | return 65 | 66 | actions = [] # List to store the documents to be indexed 67 | 68 | # Open the error log file for writing 69 | with error_log_path.open('w') as error_log: 70 | # Iterate through each file in the source directory 71 | for file_name in tqdm(list(os.listdir(source_directory))): 72 | if file_name.endswith('.jsonl'): 73 | source_file = source_directory / file_name 74 | 75 | # Open and read the JSONL file 76 | with open(source_file, 'r') as json_file: 77 | for line in json_file: 78 | try: 79 | doc = json.loads(line) 80 | 81 | action = { 82 | "_index": index_name, 83 | "_source": doc 84 | } 85 | actions.append(action) 86 | 87 | if len(actions) == 200: # Bulk indexing threshold 88 | helpers.bulk(es, actions) 89 | actions = [] 90 | except json.JSONDecodeError as e: 91 | # Log the error 92 | error_log.write(f"Error in file {file_name}: {e}\n") 93 | error_log.write(f"{line}\n") 94 | except Exception as e: 95 | error_log.write(f"Unexpected error in file {file_name}: {e}\n") 96 | error_log.write(f"{line}\n") 97 | 98 | # Index any remaining documents 99 | if actions: 100 | helpers.bulk(es, actions) 101 | 102 | print('Indexing complete') 103 | 104 | # Call the function to index the documents 105 | bulk_index_documents(source_directory, index_name, error_log_path) 106 | 107 | # Count and print the number of documents in the index 108 | count_result = es.count(index=index_name) 109 | print(f"Index contains {count_result['count']} documents.") 110 | 111 | # to run this script in the background, use the following command: 112 | # nohup python3 ./ingest_data.py > output.log 2>&1 & 
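Once ingestion has finished, it is worth sanity-checking the index before wiring up the retrievers. The sketch below is illustrative rather than part of the repository; it assumes the same `ELASTIC_PASSWORD` environment variable, CA-certificate path, and `pubmed_index` name used by ingest_data.py, and simply counts the indexed documents and runs one BM25 match query.

```python
import os
from elasticsearch import Elasticsearch

# Connection settings mirror ingest_data.py; adjust the host and certificate path to your setup.
es = Elasticsearch(
    hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}],
    ca_certs="/home/rag/.crt/http_ca.crt",
    basic_auth=("elastic", os.getenv("ELASTIC_PASSWORD")),
)

index_name = "pubmed_index"

# 1. How many documents made it into the index?
print("Documents indexed:", es.count(index=index_name)["count"])

# 2. Run a single BM25 full-text query and print the top hits.
query = {
    "size": 3,
    "query": {"match": {"content": "insulin resistance"}},
    "_source": ["PMID", "title"],
}
for hit in es.search(index=index_name, body=query)["hits"]["hits"]:
    print(hit["_score"], hit["_source"].get("PMID"), hit["_source"].get("title"))
```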
-------------------------------------------------------------------------------- /rag_system/medCPT_retriever.py: -------------------------------------------------------------------------------- 1 | from medCPT_encoder import MedCPTQueryEncoder, MedCPTCrossEncoder 2 | from elasticsearch import Elasticsearch 3 | import os 4 | import requests 5 | import json 6 | 7 | 8 | class MedCPTRetriever: 9 | def __init__(self, rerank=True): 10 | elastic_password = os.getenv('ELASTIC_PASSWORD') 11 | self.es = Elasticsearch( 12 | ['https://localhost:9200'], 13 | basic_auth=('elastic', elastic_password), 14 | verify_certs=True, 15 | ca_certs="/home/ubuntu/.crts/http_ca.crt", 16 | request_timeout=60 17 | ) 18 | self.index = "pubmed_index" 19 | self.faiss_url = "http://localhost:5000/search" 20 | self.text_encoder = MedCPTQueryEncoder() 21 | self.reranker = MedCPTCrossEncoder() 22 | self.rerank_enabled = rerank 23 | 24 | def query_to_vector(self, text: str): 25 | """Converts text query to a vector using the medCPT query encoder.""" 26 | embedding = self.text_encoder.encode(text) 27 | return embedding[0] 28 | 29 | def faiss_request(self, query: str, k: int = 100): 30 | """Performs a vector search using FAISS with the given query and k.""" 31 | vec = self.query_to_vector(query).tolist() # Convert numpy array to list 32 | data = { 33 | 'queries': [vec], # List of vectors 34 | 'k': k 35 | } 36 | response = requests.post(self.faiss_url, headers={'Content-Type': 'application/json'}, data=json.dumps(data)) 37 | return response.json() 38 | 39 | def get_docs_via_PMIDs(self, PMIDs: list): 40 | """Retrieves documents from Elasticsearch using a list of PMIDs.""" 41 | query = { 42 | "size": len(PMIDs), 43 | "query": { 44 | "terms": { 45 | "PMID": PMIDs 46 | } 47 | }, 48 | "_source": ["PMID", "title", "content"] 49 | } 50 | return self.es.search(index=self.index, body=query) 51 | 52 | def rerank_docs(self, query: str, docs: list, top_n: int): 53 | """Reranks the documents based on their relevance to the query and returns the top N.""" 54 | scores = self.reranker.score([doc['content'] for doc in docs], query) 55 | reranked_docs = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)[:top_n] 56 | return reranked_docs 57 | 58 | def retrieve_docs(self, query: str, k: int = 20, top_n: int = 10): 59 | """Retrieves documents relevant to the query using both FAISS and Elasticsearch.""" 60 | response = self.faiss_request(query, k) 61 | PMIDs = response['PMIDs'][0] 62 | es_response = self.get_docs_via_PMIDs(PMIDs) 63 | 64 | docs = [{ 65 | 'PMID': hit['_source']['PMID'], 66 | 'title': hit['_source']['title'], 67 | 'content': hit['_source']['content'] 68 | } for hit in es_response['hits']['hits']] 69 | 70 | # Apply reranking if enabled 71 | if self.rerank_enabled: 72 | reranked_docs = self.rerank_docs(query, docs, top_n) 73 | 74 | # only take documents with a score > 0 75 | reranked_docs = [doc for doc in reranked_docs if doc[1] > 0] 76 | 77 | results = { 78 | f"doc{idx + 1}": { 79 | 'PMID': doc['PMID'], 80 | 'title': doc['title'], 81 | 'content': doc['content'], 82 | 'score': score.item() 83 | } 84 | for idx, (doc, score) in enumerate(reranked_docs) 85 | } 86 | else: 87 | results = { 88 | f"doc{idx + 1}": { 89 | 'PMID': doc['PMID'], 90 | 'title': doc['title'], 91 | 'content': doc['content'] 92 | } 93 | for idx, doc in enumerate(docs[:top_n]) 94 | } 95 | 96 | return json.dumps(results, indent=4) 97 | 98 | 99 | if __name__ == "__main__": 100 | retriever = MedCPTRetriever() 101 | query = "What is the treatment for diabetes?" 
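# With rerank=True (the default) the k FAISS nearest neighbours are re-scored by the MedCPT
# cross-encoder and only the top_n best-scoring documents are returned; with rerank=False the
# first top_n documents fetched from Elasticsearch are returned without scores.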
102 | print(retriever.retrieve_docs(query, k=20, top_n=3)) 103 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/faiss/request.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Status Code: 200\n", 13 | "Response: {'distances': [[345.34063720703125, 347.5325622558594, 347.6310729980469, 347.86968994140625, 348.7896728515625, 348.9269714355469, 349.13153076171875, 349.4542236328125, 349.48699951171875, 349.59759521484375, 349.71319580078125, 349.86529541015625, 350.11236572265625, 350.1585693359375, 350.3203125, 350.3330993652344, 350.364990234375, 350.3964538574219, 350.4454040527344, 350.55328369140625, 350.5574951171875, 350.5615234375, 350.697265625, 350.7253723144531, 350.7264404296875, 350.75177001953125, 350.86883544921875, 350.92620849609375, 350.9451904296875, 350.97515869140625, 351.14190673828125, 351.14959716796875, 351.150634765625, 351.163330078125, 351.16705322265625, 351.2235412597656, 351.3066711425781, 351.31182861328125, 351.3298034667969, 351.3414306640625, 351.3535461425781, 351.3605041503906, 351.3978576660156, 351.4039306640625, 351.42755126953125, 351.4990234375, 351.50018310546875, 351.5561828613281, 351.5712890625, 351.578369140625, 351.599365234375, 351.6015625, 351.6611328125, 351.699462890625, 351.7027587890625, 351.70855712890625, 351.7459716796875, 351.76617431640625, 351.775634765625, 351.7792663574219, 351.77984619140625, 351.797607421875, 351.80450439453125, 351.8053283691406, 351.808349609375, 351.8154296875, 351.8173828125, 351.84246826171875, 351.8436584472656, 351.85662841796875, 351.86383056640625, 351.869873046875, 351.8704833984375, 351.87548828125, 351.88934326171875, 351.89654541015625, 351.8977355957031, 351.900634765625, 351.9150390625, 351.933837890625, 351.94317626953125, 351.9481201171875, 351.9498291015625, 351.9609375, 351.97540283203125, 351.98065185546875, 351.9837646484375, 351.9930725097656, 352.00732421875, 352.0120849609375, 352.0337829589844, 352.0369873046875, 352.04071044921875, 352.04254150390625, 352.048095703125, 352.0513916015625, 352.05279541015625, 352.06463623046875, 352.0819091796875, 352.0927734375]], 'indices': [[1133332, 1129233, 2158547, 670332, 1559199, 1707872, 1346023, 1932016, 1302318, 1893635, 1375642, 2191381, 2179104, 2222540, 1133682, 1433335, 184772, 1298703, 1044265, 670344, 2402989, 334779, 2019346, 658058, 1398487, 1777282, 273849, 1384217, 436939, 2265182, 1616784, 2098363, 966665, 473742, 261485, 17393, 1722879, 1125923, 2132707, 1106967, 1910545, 82362, 552523, 958848, 2023649, 594676, 1319413, 2277406, 1895067, 1923708, 1715754, 374483, 190973, 1894858, 382862, 1881046, 401552, 1923465, 620328, 368685, 1215806, 1507128, 1765301, 88256, 527378, 1328518, 924844, 2294050, 79309, 1816391, 1538691, 975819, 2263032, 1210736, 943233, 1616936, 1426340, 1337171, 528403, 2033495, 1199468, 523944, 822048, 1138545, 1677746, 2225820, 309120, 1190258, 192989, 2212557, 886555, 225812, 1244613, 1911463, 2119927, 2344697, 1931231, 462343, 469270, 1742767]]}\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import requests\n", 19 | "import numpy as np\n", 20 | "import json\n", 21 | "\n", 22 | "# URL des Flask-Endpoints\n", 23 | "url = 'http://localhost:5000/search'\n", 24 | "\n", 25 | "# Generiere einen zufälligen Vektor\n", 26 
| "random_vector = np.random.rand(768).tolist()\n", 27 | "\n", 28 | "k = 100\n", 29 | "\n", 30 | "# Daten für die POST-Anfrage\n", 31 | "data = {\n", 32 | " 'queries': [random_vector],\n", 33 | " 'k': k\n", 34 | "}\n", 35 | "\n", 36 | "# Senden der POST-Anfrage\n", 37 | "response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json.dumps(data))\n", 38 | "\n", 39 | "# Ausgabe der Antwort\n", 40 | "print('Status Code:', response.status_code)\n", 41 | "print('Response:', response.json())" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.10.12" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 2 66 | } 67 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/documentation.md: -------------------------------------------------------------------------------- 1 | ### Investigation of Elasticsearch and MongoDB as Data Storage 2 | 3 | #### Elasticsearch 4 | ElasticSearch is built on Java and utilizes the Lucene search engine. It writes data to inverted indexes using Lucene segments. 5 | Elasticsearch avoids excessive I/O by creating dedicated transactional index logs, preventing frequent low-level Lucene commits during indexing. 6 | 7 | #### MongoDB: 8 | MongoDB, written in C++, uses a memory map file to map on-disk data files to in-memory byte arrays. 9 | It organizes data using a doubly linked data structure. MongoDB processes shut down in case of low system memory or high resource utilization, ensuring stability 10 | 11 | #### Indexes for Full-Text Search in Elasticsearch and MongoDB. 12 | 13 | - **Elasticsearch** uses inverted indexes for full-text search. It uses the BM25 algorithm to rank documents based on relevance. 14 | - **MongoDB** uses ... 15 | 16 | #### Loading data into Elasticsearch: 17 | 1. Created an dense vector index with 768 dimensions for bioBERT embeddings. 18 | 2. Indexed the embeddings of the first 50 JSONL files in the dense vector index. 19 | 3. Indexing time took: [2:14: 00<00:00, 80.71s/it] for 1'800'000 documents. 20 | 4. 21 | 5. Bulk loading (200 docs) turned out to be significantly faster than single document loading. 22 | 23 | #### Loading data into MongoDB: 24 | 1. Created a collection with bioBERT embeddings. 25 | 2. Inserted the embeddings of the first 100 JSONL files into the collection. 26 | 3. Inserting time took: [21:06<00:00, 12.66s/it] for 1'795'879 documents. 27 | 4. Inserting data (Bulk: 1000 docs) into MongoDB was faster than Elasticsearch. 28 | 5. Text indexing for full text search took 14:30 minutes. 29 | 30 | ### Retriever Comparison 31 | 32 | - **Full text search** using BM25 ranking algorithm. 33 | - **Semantic search** using bioBERT embedding and KNN / Cosine Similarity. 34 | - **Hybrid search** previous ranking using BM25 followed by semantic search and/or DPR. 35 | 36 | ### BioLinkBERT for Information Retrieval 37 | 38 | Die Integration von BioLinkBERT oder ähnlichen Sprachmodellen in ein Information Retrieval (IR) System kann die Genauigkeit und Relevanz der Suchergebnisse erheblich verbessern, insbesondere in spezialisierten Wissensdomänen wie der Biomedizin. 
39 | 40 | ### 1. Choosing the Information Retrieval System 41 | First, a suitable IR system has to be selected or developed. This could be a traditional keyword-based search or a more advanced semantic search engine based on vector-space search (e.g. Elasticsearch or Solr with vector-search plugins). 42 | 43 | ### 2. Preparing the Index 44 | - **Document preparation**: All documents have to be indexed. This involves extracting the text, preparing it, and possibly annotating it with metadata. 45 | - **Integrating BioLinkBERT**: Use BioLinkBERT to transform texts into high-dimensional vectors that are then stored in the search index. These vectors represent the semantic signatures of the documents. 46 | 47 | ### 3. Query Processing 48 | - **Query transformation**: When a user submits a search query, the query should also be processed by BioLinkBERT to obtain its semantic representation. 49 | - **Vector search**: Use the generated vectors to compute the semantic proximity between the query and the documents in the index, for example by calculating cosine similarities between the vectors. 50 | 51 | ### 4. Ranking and Relevance Feedback 52 | - **Relevance ranking**: Documents are ranked by their semantic proximity to the query; the higher the similarity, the more relevant the document. 53 | - **Feedback loop**: Optional user feedback on the search results can be used to further train the model and improve the accuracy of the results. 54 | 55 | ### 5. Applying Advanced NLP Techniques 56 | - **Question answering**: For specific queries, especially in QA systems, BioLinkBERT can be used to extract answers directly from the texts by identifying relevant passages and surfacing the information they contain. 57 | - **Summarization and highlighting**: For longer documents, BioLinkBERT can be used to create summaries or to highlight key information relevant to the query. 58 | 59 | ### 6. Scaling and Performance Optimization 60 | - **Efficiency**: Note that processing queries with a full language model can be computationally expensive. Efficiency gains can be achieved through techniques such as quantization, pruning, or specialized hardware. 61 | - **Parallelization**: To increase throughput, queries can be parallelized and executed on several servers or in the cloud. 62 | 63 | Integrating BioLinkBERT into an IR system requires careful planning and optimization, but it can considerably improve the system's ability to find topically relevant and contextually appropriate documents.
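To make the vector-search and ranking steps described above (sections 3 and 4) concrete, the sketch below embeds a query and a few documents and ranks the documents by cosine similarity. The encoder name is only an assumption for illustration (the repository's BioBERT mean-pooling setup or any other sentence-level encoder could be substituted); this is not the retrieval code used elsewhere in the repo.

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Assumed encoder: any model that maps a text to a fixed-size vector works here.
# (sentence-transformers falls back to mean pooling for plain transformer checkpoints.)
model = SentenceTransformer("dmis-lab/biobert-v1.1")

documents = [
    "Metformin is a first-line oral medication for type 2 diabetes.",
    "Amyloid-beta plaques are a pathological hallmark of Alzheimer's disease.",
    "Insulin therapy is used when oral agents fail to control blood glucose.",
]
query = "How is type 2 diabetes treated?"

doc_vecs = model.encode(documents)    # shape: (n_docs, dim)
query_vec = model.encode([query])[0]  # shape: (dim,)

def cosine(a, b):
    # Cosine similarity: dot product divided by the product of the vector norms.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

scores = [cosine(query_vec, d) for d in doc_vecs]

# Rank documents by similarity, highest first.
for idx in np.argsort(scores)[::-1]:
    print(f"{scores[idx]:.3f}  {documents[idx]}")
```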
-------------------------------------------------------------------------------- /information_retrieval/faiss_container/faiss_insert_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Ingesting embeddings into Faiss index" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import faiss\n", 17 | "import json\n", 18 | "import numpy as np\n", 19 | "import os\n", 20 | "import csv\n", 21 | "from pathlib import Path\n", 22 | "from tqdm import tqdm" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Parsing through the JSONL files, extracting the bioBERT embeddings and the corresponding PMIDs. Creating a Faiss index with the embeddings and a CSV file storing PMIDs with the corresponding Faiss index id. " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stderr", 39 | "output_type": "stream", 40 | "text": [ 41 | "Processing JSONL files: 100%|██████████| 140/140 [41:12<00:00, 17.66s/it]\n" 42 | ] 43 | }, 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Index successfully written to: /home/ubuntu/data/faiss_indices/medCPT/medCPT_index.index\n", 49 | "CSV file successfully written to: /home/ubuntu/data/faiss_indices/medCPT/medCPT_pmids.csv\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "# Directory setup\n", 55 | "index_directory = \"/home/ubuntu/data/faiss_indices/medCPT\"\n", 56 | "index_file = \"medCPT_index.index\"\n", 57 | "index_path = os.path.join(index_directory, index_file)\n", 58 | "csv_file = \"/home/ubuntu/data/faiss_indices/medCPT/medCPT_pmids.csv\"\n", 59 | "\n", 60 | "# Ensure the index directory exists\n", 61 | "if not os.path.exists(index_directory):\n", 62 | " os.makedirs(index_directory)\n", 63 | "\n", 64 | "# Dimensions of the embeddings\n", 65 | "d = 768\n", 66 | "\n", 67 | "# Initialize the Faiss index (Flat L2-Index)\n", 68 | "index = faiss.IndexFlatL2(d)\n", 69 | "\n", 70 | "# Initialize the CSV file for PMIDs\n", 71 | "csv_path = csv_file\n", 72 | "csv_rows = []\n", 73 | "\n", 74 | "# Collecting all JSONL files in the current directory\n", 75 | "source_directory = Path('/home/ubuntu/data/pubmed_medCPT')\n", 76 | "\n", 77 | "# Retrieve and sort the files based on their numerical order in filenames\n", 78 | "sorted_files = sorted(source_directory.glob('*.jsonl'), key=lambda x: int(x.stem.split('n')[-1]))\n", 79 | "\n", 80 | "# Processing sorted files with progress display\n", 81 | "for file_name in tqdm(sorted_files, desc=\"Processing JSONL files\"):\n", 82 | " with open(file_name, 'r') as file:\n", 83 | " for line in file:\n", 84 | " try:\n", 85 | " data = json.loads(line)\n", 86 | " embeddings = data.get('embedding')\n", 87 | " pmid = int(data.get('PMID'))\n", 88 | " \n", 89 | " # If embeddings and PMID are present, add them to the index\n", 90 | " if embeddings and pmid:\n", 91 | " embeddings = np.array(embeddings, dtype='float32').reshape(1, -1) # Convert to NumPy array and reshape\n", 92 | " index.add(embeddings)\n", 93 | " \n", 94 | " # Add PMIDs, filenames, and index numbers for ordering to the CSV\n", 95 | " index_num = index.ntotal - 1 # Index number of the last added embedding\n", 96 | " csv_rows.append([pmid, file_name.name, index_num])\n", 97 | " except json.JSONDecodeError as 
e:\n", 98 | " print(f\"Error decoding JSON in file {file_name}: {e}\")\n", 99 | "\n", 100 | "# Write the index to a file\n", 101 | "faiss.write_index(index, index_path)\n", 102 | "\n", 103 | "print(f\"Index successfully written to: {index_path}\")\n", 104 | "\n", 105 | "# Write PMIDs to CSV file\n", 106 | "with open(csv_path, 'w', newline='') as csvfile:\n", 107 | " csv_writer = csv.writer(csvfile)\n", 108 | " csv_writer.writerow(['PMID', 'Filename', 'Index'])\n", 109 | " csv_writer.writerows(csv_rows)\n", 110 | "\n", 111 | "print(f\"CSV file successfully written to: {csv_path}\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "index = faiss.read_index('faiss_indices/PM_index.index')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "k = 10 # Number of nearest neighbors\n", 130 | "\n", 131 | "query = np.random.rand(768).tolist()\n", 132 | "\n", 133 | "distances, indices = index.search(query, k)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 1, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "Status Code: 200\n", 146 | "Response: {'distances': [[348.77490234375, 348.9889221191406, 349.5247497558594, 349.7203369140625, 349.90228271484375, 349.9190979003906, 350.23382568359375, 350.36578369140625, 350.47930908203125, 350.5979309082031]], 'indices': [[470115, 1932016, 473742, 469270, 1405245, 670332, 1715754, 2382674, 1707872, 2141577]]}\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "import requests\n", 152 | "import numpy as np\n", 153 | "import json\n", 154 | "\n", 155 | "# URL of the Flask endpoint\n", 156 | "url = 'http://localhost:5000/search'\n", 157 | "\n", 158 | "# Generate a random vector of length 768\n", 159 | "random_vector = np.random.rand(768).tolist() # Convert numpy array directly to list\n", 160 | "\n", 161 | "# Data for the POST request\n", 162 | "data = {\n", 163 | " 'queries': [random_vector] # Ensure this is a list of lists\n", 164 | "}\n", 165 | "\n", 166 | "# Convert data to JSON before sending as POST request\n", 167 | "json_data = json.dumps(data)\n", 168 | "\n", 169 | "# Send the POST request\n", 170 | "response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json_data)\n", 171 | "\n", 172 | "# Output the response\n", 173 | "print('Status Code:', response.status_code)\n", 174 | "print('Response:', response.json())" 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "base", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.10.12" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Medical RAG System 3 | 4 | This repository contains a comprehensive implementation of a Medical Retrieval-Augmented Generation (RAG) system. 
The system integrates multiple components for document retrieval, question answering, and evaluation, tailored specifically for the medical domain. 5 | 6 | ## Table of Contents 7 | - [Overview](#overview) 8 | - [File Structure](#file-structure) 9 | - [Installation](#installation) 10 | - [Usage](#usage) 11 | - [Components](#components) 12 | - [Retrieval System](#retrieval-system) 13 | - [Question Answering System](#question-answering-system) 14 | - [Evaluation](#evaluation) 15 | - [Data Storage](#data-storage) 16 | - [Contributing](#contributing) 17 | - [License](#license) 18 | 19 | ## Overview 20 | 21 | The Medical RAG System is designed to enhance medical information retrieval and provide accurate answers to medical queries. It combines various retrieval methods, including BM25, bioBERT, and hybrid models, with advanced question-answering techniques to ensure precise and relevant results. 22 | 23 | 24 | ## File structure 25 | 26 | ```plaintext 27 | ├── evaluation 28 | │   ├── evaluation_data_storages 29 | │   │   ├── documentation.md 30 | │   │   ├── elasticsearch 31 | │   │   │   ├── elastic.ipynb 32 | │   │   │   └── eval_elastic.ipynb 33 | │   │   ├── faiss 34 | │   │   │   ├── conncatinatior.py 35 | │   │   │   ├── embedding_extractor.py 36 | │   │   │   └── request.ipynb 37 | │   │   └── mongodb 38 | │   │   ├── eval_mongo.ipynb 39 | │   │   └── mongoDB.ipynb 40 | │   └── evaluation_QA_system 41 | │   ├── dataset_filter 42 | │   │   └── filter_data.ipynb 43 | │   ├── evaluation_pipeline.ipynb 44 | │   ├── explore_questions.ipynb 45 | │   ├── full_text_evaluation.py 46 | │   └── RAG_evaluator.py 47 | ├── information_retrieval 48 | │   ├── document_encoding 49 | │   │   ├── bioBERT_encoder.py 50 | │   │   ├── encode_documents.ipynb 51 | │   │   └── medCPT_encoder.py 52 | │   ├── elastic_container 53 | │   │   ├── elastic.ipynb 54 | │   │   ├── ingest_data.py 55 | │   │   └── start_elasticsearch.sh 56 | │   └── faiss_container 57 | │   ├── docker-compose.yml 58 | │   ├── Dockerfile 59 | │   ├── faiss_insert_data.ipynb 60 | │   └── server.py 61 | ├── rag_system 62 | │   ├── bioBERT_encoder.py 63 | │   ├── bioBERT_retriever.py 64 | │   ├── bm25_retriever.py 65 | │   ├── hybrid_retriever.py 66 | │   ├── medCPT_encoder.py 67 | │   ├── medCPT_retriever.py 68 | │   ├── med_rag.py 69 | │   ├── openAI_chat.py 70 | │   └── pipeline.ipynb 71 | ├── README.md 72 | ├── requirements.txt 73 | └── sys_requirements.txt 74 | 75 | ``` 76 | 77 | ## Installation 78 | 79 | To set up the Medical RAG System, follow these steps: 80 | 81 | 1. **Clone the Repository** 82 | 83 | ``` 84 | git clone https://github.com/slinusc/medical_RAG_system.git 85 | cd medical_RAG_system 86 | ``` 87 | 88 | 2. **Install Dependencies** 89 | 90 | Create a virtual environment and install the required packages: 91 | 92 | ``` 93 | python -m venv venv 94 | source venv/bin/activate # On Windows, use `venv\Scripts\activate` 95 | pip install -r requirements.txt 96 | ``` 97 | 98 | 3. **Download Pre-trained Models** 99 | 100 | Ensure that you download and set up any necessary pre-trained models (e.g., BioBERT, MedCPT). 101 | 102 | ## Usage 103 | 104 | The system can be used for different purposes, including document retrieval, question answering, and evaluation. Each component has its own set of instructions and example notebooks. 
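For a quick start, the pipeline can also be driven directly from Python. The following is a minimal sketch distilled from `evaluation/evaluation_QA_system/evaluation_pipeline.ipynb`; the retriever and question-type codes mirror the ones used there, but treat the exact `MedRAG` constructor arguments as illustrative rather than canonical:

```python
import json
import sys

sys.path.append("rag_system")  # run from the repository root
from med_rag import MedRAG

# Retriever codes used in the evaluation notebooks:
# 1 = BioBERT, 2 = BM25, 3 = hybrid (BM25 + MedCPT re-ranking), 4 = MedCPT
# question_type selects the prompt for the BioASQ question type (2 = yes/no).
rag = MedRAG(retriever=3, question_type=2)

# get_answer() returns a JSON string with the generated answer, the retrieved
# and used PubMed IDs, and the retrieval/generation timings.
result = json.loads(rag.get_answer("Is metformin used to treat type 2 diabetes?"))

print(result["response"])
print("Retrieved PMIDs:", result["retrieved_PMIDs"])
print("PMIDs cited in the answer:", result["used_PMIDs"])
```

See `rag_system/pipeline.ipynb` for a complete walkthrough.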
105 | 106 | ### Retrieval System 107 | 108 | - **BM25 Retriever**: `rag_system/bm25_retriever.py` 109 | - **BioBERT Retriever**: `rag_system/bioBERT_retriever.py` 110 | - **Hybrid Retriever**: `rag_system/hybrid_retriever.py` 111 | 112 | ### Question Answering System 113 | 114 | - **Medical RAG**: `rag_system/med_rag.py` 115 | - **OpenAI Chat**: `rag_system/openAI_chat.py` 116 | 117 | ### Datasets 118 | 119 | The 2.4M PubmEd subset we used can be accessed here: [slinusc/PubMedAbstractsSubset](https://huggingface.co/datasets/slinusc/PubMedAbstractsSubset). 120 | If you're looking for the precomputed embedding vectors (MedCPT) used in our work [*Efficient and Reproducible Biomedical Question Answering using Retrieval Augmented Generation*](https://arxiv.org/abs/2505.07917), they are available in a separate dataset: [slinusc/PubMedAbstractsSubsetEmbedded](https://huggingface.co/datasets/slinusc/PubMedAbstractsSubsetEmbedded). 121 | 122 | ### Evaluation 123 | 124 | Evaluation scripts and notebooks are located in the `evaluation/evaluation_QA_system/` directory. Example notebooks are provided to demonstrate the evaluation process. 125 | 126 | #### Running an Evaluation 127 | 128 | 1. **Filter the Data (Optional)** 129 | 130 | If you need to filter your dataset before evaluation, use the provided notebook: 131 | 132 | ``` 133 | evaluation/evaluation_QA_system/dataset_filter/filter_data.ipynb 134 | ``` 135 | 136 | 2. **Evaluate** 137 | To run the evaluation pipeline, use the evaluation_pipeline.ipynb notebook located in the evaluation/evaluation_QA_system/ directory. This notebook provides a comprehensive guide and setup to evaluate the performance of the RAG system. 138 | 139 | ## Used Infrastructure 140 | 141 | The experiments were conducted on the following system: 142 | 143 | | **Component** | **Specification** | 144 | |----------------------|--------------------------------------------| 145 | | **Architecture** | x86_64 | 146 | | **CPU** | 8 CPUs | 147 | | **Model** | Intel Core Processor (Broadwell) | 148 | | **Memory** | 32 GiB total, 10 GiB used for buffers/cache | 149 | | **Storage** | 240 GiB disk size | 150 | | **Operating System** | Ubuntu 22.04.4 LTS (Jammy) | 151 | | **Kernel Version** | 5.15.0-102-generic | 152 | | **GPU** | NVIDIA A30 | 153 | 154 | 155 | ## Contributing 156 | 157 | We welcome contributions to enhance the Medical RAG System. Please follow these steps to contribute: 158 | 159 | 1. Fork the repository. 160 | 2. Create a new branch: `git checkout -b feature-branch`. 161 | 3. Make your changes and commit them: `git commit -m 'Add new feature'`. 162 | 4. Push to the branch: `git push origin feature-branch`. 163 | 5. Create a pull request. 
164 | 165 | ## 📄 Publications 166 | 167 | **Efficient and Reproducible Biomedical Question Answering using Retrieval Augmented Generation** 168 | Linus Stuhlmann, Michael Saxer, Jonathan Fürst 169 | 170 | Please cite our work: 171 | 172 | ```bibtex 173 | @INPROCEEDINGS{11081505, 174 | author={Stuhlmann, Linus and Saxer, Michael Alexander and Fürst, Jonathan}, 175 | booktitle={2025 IEEE Swiss Conference on Data Science (SDS)}, 176 | title={Efficient and Reproducible Biomedical Question Answering Using Retrieval Augmented Generation}, 177 | year={2025}, 178 | volume={}, 179 | number={}, 180 | pages={154-157}, 181 | keywords={Accuracy;Scalability;Large language models;Retrieval augmented generation;Data science;Information retrieval;Question answering (information retrieval);Hybrid power systems;Time factors;Indexing;Biomedical Information Retrieval;RetrievalAugmented Generation;Hybrid Retrieval;Large Language Models;PubMed;Information Retrieval Systems}, 182 | doi={10.1109/SDS66131.2025.00029}} 183 | ``` 184 | 185 | [Read the paper on arXiv](https://arxiv.org/abs/2505.07917) 186 | 187 | ## License 188 | 189 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 190 | ``` 191 | -------------------------------------------------------------------------------- /information_retrieval/elastic_container/elastic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "from elasticsearch import Elasticsearch\n", 11 | "import os\n", 12 | "from pathlib import Path\n", 13 | "from tqdm import tqdm" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "ObjectApiResponse({'name': 'e16354f42e49', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'QRfx48-WQEmifPZNrtrbGw', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" 25 | ] 26 | }, 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "password = os.getenv(\"ELASTIC_PASSWORD\")\n", 34 | "password = \"B*WJBFKxDIuH9erC-V2d\"\n", 35 | "\n", 36 | "es = Elasticsearch(\n", 37 | " ['https://localhost:9200'],\n", 38 | " basic_auth=('elastic', password),\n", 39 | " verify_certs=True,\n", 40 | " ca_certs=\"/home/rag/.crt/http_ca.crt\",\n", 41 | " request_timeout=60\n", 42 | " )\n", 43 | "\n", 44 | "es.info()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "pubmed_index\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "indices = es.cat.indices(format='json')\n", 62 | "\n", 63 | "# Print the indices\n", 64 | "for index in indices:\n", 65 | " print(index['index'])\n", 66 | "\n", 67 | "index = \"pubmed_index\"" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import re\n", 77 | "\n", 78 | "source_directory = 
Path('/home/rag/data/chunk')\n", 79 | "\n", 80 | "# get file namens sorted by number\n", 81 | "\n", 82 | "files = sorted([f for f in source_directory.iterdir() if f.is_file()])\n", 83 | "\n", 84 | "files" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Define the index name\n", 94 | "index_name = \"pubmed_index_embedded\"\n", 95 | "\n", 96 | "# Delete the index if it exists\n", 97 | "if es.indices.exists(index=index_name):\n", 98 | " es.indices.delete(index=index_name)\n", 99 | "\n", 100 | "# Check again if the index exists, and if not, create it\n", 101 | "if not es.indices.exists(index=index_name):\n", 102 | " # Define the mapping\n", 103 | " {\n", 104 | " \"settings\": {\n", 105 | " \"analysis\": {\n", 106 | " \"analyzer\": {\n", 107 | " \"custom_lemmatizer_analyzer\": {\n", 108 | " \"type\": \"custom\",\n", 109 | " \"tokenizer\": \"standard\",\n", 110 | " \"filter\": [\"lowercase\", \"stopwords\", \"lemmatizer_filter\"]\n", 111 | " }\n", 112 | " },\n", 113 | " \"filter\": {\n", 114 | " \"lemmatizer_filter\": {\n", 115 | " \"language\": \"English\" # Specify the language for lemmatization\n", 116 | " },\n", 117 | " \"stopwords\": {\n", 118 | " \"type\": \"stop\",\n", 119 | " \"stopwords\": \"_english_\" # the built-in English stop words list\n", 120 | " }\n", 121 | " }\n", 122 | " }\n", 123 | " },\n", 124 | " \"mappings\": {\n", 125 | " \"properties\": {\n", 126 | " \"content\": {\n", 127 | " \"type\": \"text\",\n", 128 | " \"analyzer\": \"custom_lemmatizer_analyzer\"\n", 129 | " }\n", 130 | " }\n", 131 | " }\n", 132 | "}\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "# Create the index with the defined mapping\n", 137 | "es.indices.create(index=index_name, body=mapping)\n", 138 | "\n", 139 | "source_directory = Path('/home/rag/data/chunk')\n", 140 | "error_log_path = Path('./errors.jsonl') # Pfad zur Fehlerprotokolldatei\n", 141 | "\n", 142 | "def bulk_index_documents(source_directory, index_name, error_log_path):\n", 143 | " if not source_directory.exists():\n", 144 | " print(\"The source directory does not exist.\")\n", 145 | " return\n", 146 | "\n", 147 | " actions = [] # List to store the documents to be indexed\n", 148 | "\n", 149 | " # Open the error log file for writing\n", 150 | " with error_log_path.open('w') as error_log:\n", 151 | " # Iterate through each file in the source directory\n", 152 | " num_files = 0\n", 153 | " max_files = 300\n", 154 | " for file_name in tqdm(list(os.listdir(source_directory))):\n", 155 | " if file_name.endswith('.jsonl'):\n", 156 | " source_file = source_directory / file_name\n", 157 | " \n", 158 | " # Open and read the JSONL file\n", 159 | " with open(source_file, 'r') as json_file:\n", 160 | " for line in json_file:\n", 161 | " try:\n", 162 | " doc = json.loads(line)\n", 163 | " \n", 164 | " # Remove the \"embeddings\" field from the document\n", 165 | " #if \"embeddings\" in doc:\n", 166 | " # del doc[\"embeddings\"]\n", 167 | " \n", 168 | " action = {\n", 169 | " \"_index\": index_name,\n", 170 | " \"_source\": doc\n", 171 | " }\n", 172 | " actions.append(action)\n", 173 | "\n", 174 | " if len(actions) == 200: # Bulk indexing threshold\n", 175 | " helpers.bulk(es, actions)\n", 176 | " actions = []\n", 177 | " except json.JSONDecodeError as e:\n", 178 | " # Log the error\n", 179 | " error_log.write(f\"Error in file {file_name}: {e}\\n\")\n", 180 | " error_log.write(f\"{line}\\n\")\n", 181 | " except Exception as e:\n", 182 | " 
error_log.write(f\"Unexpected error in file {file_name}: {e}\\n\")\n", 183 | " error_log.write(f\"{line}\\n\")\n", 184 | "\n", 185 | " # Index any remaining documents\n", 186 | " if actions:\n", 187 | " helpers.bulk(es, actions)\n", 188 | "\n", 189 | " print('Indexing complete')\n", 190 | "\n", 191 | "# Call the function to index the documents\n", 192 | "bulk_index_documents(source_directory, index_name, error_log_path)\n", 193 | "\n", 194 | "# Count and print the number of documents in the index\n", 195 | "count_result = es.count(index=index_name)\n", 196 | "print(f\"Index contains {count_result['count']} documents.\")" 197 | ] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.10.12" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /information_retrieval/document_encoding/encode_documents.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5ea70828-38b9-48db-a7f4-ec9c644bfc2d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import ray\n", 11 | "import torch\n", 12 | "import os\n", 13 | "import langchain_community\n", 14 | "from ray.data import ActorPoolStrategy\n", 15 | "from tqdm import tqdm\n", 16 | "import pandas as pd\n", 17 | "from ray.data import from_pandas\n", 18 | "from functools import partial\n", 19 | "import torch\n", 20 | "from functools import partial\n", 21 | "from bioBERT_encoder import bioBERTEncoder\n", 22 | "from medCPT_encoder import medCPTArticleEncoder" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "f380bee0-db3e-4c56-81b1-a827aca6d048", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Using cuda.\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 41 | "print(f\"Using {device}.\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "3c8bcc10-7ce7-4337-91b3-99795848b21c", 47 | "metadata": {}, 48 | "source": [ 49 | "### Initializing Ray" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "9bea217e-3ee8-4fe4-9c7a-511324db3215", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stderr", 60 | "output_type": "stream", 61 | "text": [ 62 | "2024-05-02 15:37:12,105\tINFO worker.py:1567 -- Connecting to existing Ray cluster at address: 10.10.2.206:6379...\n", 63 | "2024-05-02 15:37:12,116\tINFO worker.py:1743 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "runtime_env = {\n", 69 | " \"pip\": [\n", 70 | " \"langchain-text-splitters\",\n", 71 | " \"langchain_community\", \n", 72 | " \"sentence_transformers\"\n", 73 | " ],\n", 74 | "}\n", 75 | "\n", 76 | "if not ray.is_initialized():\n", 77 | " ray.init(runtime_env=runtime_env)\n", 78 | "else:\n", 79 | " ray.shutdown()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "3de246c8-44a7-4fc1-aeeb-86396be8588d", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Verfügbare Ressourcen: {'CPU': 32.0, 'object_store_memory': 17714153471.0, 'memory': 40311046965.0, 'GPU': 4.0, 'accelerator_type:T4': 4.0, 'node:10.10.3.5': 1.0, 'node:10.10.3.72': 1.0, 'node:10.10.2.206': 1.0, 'node:__internal_head__': 1.0, 'node:10.10.2.65': 1.0}\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "available_resources = ray.available_resources()\n", 98 | "print(\"Verfügbare Ressourcen:\", available_resources)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "id": "d7f12da1-fb93-4f7e-a083-140b47c01c62", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "['data/pubmed/chunk/pubmed23n0046.jsonl',\n", 111 | " 'data/pubmed/chunk/pubmed23n0050.jsonl',\n", 112 | " 'data/pubmed/chunk/pubmed23n0003.jsonl',\n", 113 | " 'data/pubmed/chunk/pubmed23n0117.jsonl',\n", 114 | " 'data/pubmed/chunk/pubmed23n0068.jsonl']" 115 | ] 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "directory_path = \"data/pubmed/chunk/\"\n", 124 | "file_names = os.listdir(directory_path)\n", 125 | "file_paths = [os.path.join(directory_path, file_name) for file_name in file_names]\n", 126 | "jsonl_file_paths = [file_path for file_path in file_paths if file_path.endswith('.jsonl')]\n", 127 | "\n", 128 | "jsonl_file_paths[:5]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "429f5635-5790-4ebe-aa5e-9d40c79c0a8c", 134 | "metadata": {}, 135 | "source": [ 136 | "### Using only head node for embedding.\n", 137 | "\n", 138 | "Initializing BioBERT Embedding Model" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "id": "96e3ffe7-c78a-4d34-9e3c-de44a392fd32", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "encoder = bioBERTEncoder()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "4298cec9-539d-4c9c-8de3-fdcd49bdde71", 154 | "metadata": {}, 155 | "source": [ 156 | "Iterating through every JSONL file adding the attribute \"embeddings\"" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 10, 162 | "id": "a64c8a49-1e6e-45ae-87f2-526eee4715d2", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [13:00:29<00:00, 473.02s/it]" 170 | ] 171 | }, 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Alle Dateien wurden verarbeitet und gespeichert.\n" 177 | ] 178 | }, 179 | { 180 | "name": "stderr", 181 | "output_type": "stream", 182 | "text": [ 183 | "\n" 184 | ] 185 | } 186 | ], 
187 | "source": [ 188 | "import os\n", 189 | "import json\n", 190 | "from pathlib import Path\n", 191 | "\n", 192 | "# Definiere die Pfade für die Quell- und Zielverzeichnisse\n", 193 | "source_directory = Path('data/pubmed/chunk')\n", 194 | "target_directory = Path('data/pubmed/embedded')\n", 195 | "target_directory.mkdir(parents=True, exist_ok=True)\n", 196 | "\n", 197 | "# Iteriert durch jede Datei im Quellverzeichnis\n", 198 | "for file_name in tqdm(os.listdir(source_directory)):\n", 199 | " if file_name.endswith('.jsonl'):\n", 200 | " source_file = source_directory / file_name\n", 201 | " target_file = target_directory / file_name\n", 202 | "\n", 203 | " # Erstellt eine neue Datei im Zielverzeichnis\n", 204 | " with open(target_file, 'w') as target:\n", 205 | " with open(source_file, 'r') as source:\n", 206 | " for line in source:\n", 207 | " # Jede Zeile ist ein JSON-Objekt\n", 208 | " item = json.loads(line)\n", 209 | " # Verarbeite das Item mit EmbedChunks\n", 210 | " embedded_item = encoder([item])[0] # [0], weil embedder eine Liste zurückgibt\n", 211 | " # Schreibe das bearbeitete Objekt in die Zieldatei\n", 212 | " target.write(json.dumps(embedded_item) + '\\n')\n", 213 | " #print(f\"{target_file} has been successfully written to data/pubmed/embedded\")\n", 214 | " \n", 215 | "print(\"Alle Dateien wurden verarbeitet und gespeichert.\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "3b1a578e-e1ca-4204-8b4b-b26fc248e400", 221 | "metadata": {}, 222 | "source": [ 223 | "To improve performance we'll try to distribute the embedding process on the Ray cluster using 4 nodes with GPUs. With one node the embedding of 1.8 mio documents took 14 hours." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "1b114122-6b49-4521-973f-e5ca45f6dc35", 229 | "metadata": {}, 230 | "source": [ 231 | "### MedCPT\n", 232 | "now encode with article encoder of MedCPT" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "id": "e586b89d-c760-41cb-b542-1222b3b69483", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "encoder = medCPTArticleEncoder()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "0c3fd8c4-f408-4eae-9ea4-e003e664ceb0", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "import os\n", 253 | "import json\n", 254 | "from pathlib import Path\n", 255 | "\n", 256 | "# Definiere die Pfade für die Quell- und Zielverzeichnisse\n", 257 | "source_directory = Path('data/pubmed/chunk')\n", 258 | "target_directory = Path('data/pubmed/embedded_MedCPT')\n", 259 | "target_directory.mkdir(parents=True, exist_ok=True)\n", 260 | "\n", 261 | "# Iteriert durch jede Datei im Quellverzeichnis\n", 262 | "for file_name in tqdm(os.listdir(source_directory)):\n", 263 | " if file_name.endswith('.jsonl'):\n", 264 | " source_file = source_directory / file_name\n", 265 | " target_file = target_directory / file_name\n", 266 | "\n", 267 | " # Erstellt eine neue Datei im Zielverzeichnis\n", 268 | " with open(target_file, 'w') as target:\n", 269 | " with open(source_file, 'r') as source:\n", 270 | " for line in source:\n", 271 | " # Jede Zeile ist ein JSON-Objekt\n", 272 | " item = json.loads(line)\n", 273 | " # Verarbeite das Item mit EmbedChunks\n", 274 | " embedded_item = encoder([item])[0] # [0], weil embedder eine Liste zurückgibt\n", 275 | " # Schreibe das bearbeitete Objekt in die Zieldatei\n", 276 | " target.write(json.dumps(embedded_item) + '\\n')\n", 277 | " 
#print(f\"{target_file} has been successfully written to data/pubmed/embedded\")\n", 278 | " \n", 279 | "print(\"Alle Dateien wurden verarbeitet und gespeichert.\")" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3 (ipykernel)", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.9.18" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 5 304 | } 305 | -------------------------------------------------------------------------------- /evaluation/evaluation_QA_system/evaluation_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Evaluation of the RAG system" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "first we import some neccessary libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import sys\n", 26 | "import os\n", 27 | "from RAG_evaluator import RAG_evaluator\n", 28 | "sys.path.append(\"../../rag_system/\")\n", 29 | "from med_rag import MedRAG" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "now we define an experiment name, this name should ! uniqely! identify the experiemnal run" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "experiment_name = \"experiment_debugginglist_questions\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "then we implement a running experiment by using rag system one two and three" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Directory 'experiment_debugginglist_questions' created at /home/ubuntu/questions_answers_data/experiment_results/experiment_debugginglist_questions\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "# Base directory where the new folder will be created\n", 70 | "base_directory = \"/home/ubuntu/questions_answers_data/experiment_results\"\n", 71 | "# input directory, change if diffrent one is used\n", 72 | "question_input_dir = \"/home/ubuntu/questions_answers_data/all_questions_in_system_min3.json\"\n", 73 | "\n", 74 | "\n", 75 | "# Construct the path for the new experiment folder\n", 76 | "experiment_folder_path = os.path.join(base_directory, experiment_name)\n", 77 | "\n", 78 | "# Create the directory if it does not exist\n", 79 | "if not os.path.exists(experiment_folder_path):\n", 80 | " os.makedirs(experiment_folder_path)\n", 81 | " print(f\"Directory '{experiment_name}' created at {experiment_folder_path}\")\n", 82 | "else:\n", 83 | " print(f\"Directory '{experiment_name}' already exists at {experiment_folder_path}\")\n", 84 | "\n", 85 | "# Construct the path for the JSON file\n", 86 | "output_path_retriever_1 = os.path.join(experiment_folder_path, \"result_ragver_1.json\")\n", 87 | 
"output_path_retriever_2 = os.path.join(experiment_folder_path, \"result_ragver_2.json\")\n", 88 | "output_path_retriever_3 = os.path.join(experiment_folder_path, \"result_ragver_3.json\")\n", 89 | "output_path_retriever_4 = os.path.join(experiment_folder_path, \"result_ragver_4.json\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Evaluation of the 3 retriever types used in the RAG" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "- Retriever 1: BioBERT\n", 104 | "- Retriever 2: BM25\n", 105 | "- Retriever 3: Hybrid Retriever BM25 reranked with medCPT cross encoder\n", 106 | "- Retriever 4: medCPT Retriever with reranking" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Retriever 2: BM25" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "rag_system = MedRAG(retriever=2, question_type=2)\n", 123 | "\n", 124 | "rag_type = RAG_evaluator(\n", 125 | " rag_model=rag_system,\n", 126 | " path_to_question_json=question_input_dir,\n", 127 | " output_path=output_path_retriever_2,\n", 128 | ")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "rag_type.run_eval()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 10, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Summary Statistics for RAG with retriever 4\n", 150 | "Total Questions: 85\n", 151 | "\n", 152 | "Response Time:\n", 153 | "Mean: 3.81 seconds\n", 154 | "Standard Deviation: 1.42 seconds\n", 155 | "\n", 156 | "Summary of non-answered questions:\n", 157 | "Absolute count - No Docs Found: 0\n", 158 | "Percentage - No Docs Found: 0.00%\n", 159 | "\n", 160 | "Metrics - RAG Q&A:\n", 161 | "Accuracy: 0.86\n", 162 | "Recall: 0.86\n", 163 | "Precision: 0.87\n", 164 | "F1 Score: 0.86\n", 165 | "\n", 166 | "Metrics - Retriever:\n", 167 | "Recall Retriever: 0.32\n", 168 | "Precision Retriever: 0.01\n", 169 | "F1 Score Retriever: 0.02\n", 170 | "\n", 171 | "Metrics - Used vs Retrieved:\n", 172 | "Recall Used vs Retrieved: 0.49\n", 173 | "Precision Used vs Retrieved: 0.01\n", 174 | "F1 Score Used vs Retrieved: 0.02\n", 175 | "\n", 176 | "Additional metrics:\n", 177 | "Mean response time retriever: 2.33\n", 178 | "Standard deviation response time retriever: 0.71\n", 179 | "Mean response time generation: 1.49\n", 180 | "Standard deviation response time generation: 1.33\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "rag_type.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min2/result_ragver_4.json\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### Evaluation of all question types" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 2, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Directory 'experiment_bioASQ_min1_bioBERT' already exists at /home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_bioBERT\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "experiment_name = \"experiment_bioASQ_min1_bioBERT\"\n", 210 | "# Base directory where the new folder will be 
created\n", 211 | "base_directory = \"/home/ubuntu/questions_answers_data/experiment_results\"\n", 212 | "# input directory, change if diffrent one is used\n", 213 | "question_input_factoid = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/factoid_questions.json\"\n", 214 | "question_input_summary = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/summary_questions.json\"\n", 215 | "question_input_list = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/list_questions.json\"\n", 216 | "question_input_yesno = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/yesno_questions.json\"\n", 217 | "\n", 218 | "# Construct the path for the new experiment folder\n", 219 | "experiment_folder_path = os.path.join(base_directory, experiment_name)\n", 220 | "\n", 221 | "# Create the directory if it does not exist\n", 222 | "if not os.path.exists(experiment_folder_path):\n", 223 | " os.makedirs(experiment_folder_path)\n", 224 | " print(f\"Directory '{experiment_name}' created at {experiment_folder_path}\")\n", 225 | "else:\n", 226 | " print(f\"Directory '{experiment_name}' already exists at {experiment_folder_path}\")\n", 227 | "\n", 228 | "# Construct the path for the JSON file\n", 229 | "output_path_question_factoid = os.path.join(experiment_folder_path, \"result_factoid.json\")\n", 230 | "output_path_question_summary = os.path.join(experiment_folder_path, \"result_summary.json\")\n", 231 | "output_path_question_list = os.path.join(experiment_folder_path, \"result_list.json\")\n", 232 | "output_path_question_yesno = os.path.join(experiment_folder_path, \"result_yesno.json\")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "1. Factoid" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 3, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "rag_system = MedRAG(retriever=1, question_type=1)\n", 249 | "\n", 250 | "eval_factoid = RAG_evaluator(\n", 251 | " rag_model=rag_system,\n", 252 | " path_to_question_json=question_input_factoid,\n", 253 | " output_path=output_path_question_factoid,\n", 254 | ")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "eval_factoid.run_eval()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 28, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "Summary Statistics for RAG with retriever Unknown\n", 276 | "Total Questions: 175\n", 277 | "\n", 278 | "Response Time:\n", 279 | "Mean: 6.39 seconds\n", 280 | "Standard Deviation: 1.66 seconds\n", 281 | "\n", 282 | "Summary of non-answered questions:\n", 283 | "Absolute count - No Docs Found: 0\n", 284 | "Percentage - No Docs Found: 0.00%\n", 285 | "\n", 286 | "Metrics - Retriever:\n", 287 | "Average Recall: 0.58\n", 288 | "\n", 289 | "Metrics for RAG Usage:\n", 290 | "Average Precision: 0.34\n", 291 | "\n", 292 | "Additional metrics:\n", 293 | "Mean response time retriever: 4.16\n", 294 | "Standard deviation response time retriever: 1.17\n", 295 | "Mean response time generation: 2.22\n", 296 | "Standard deviation response time generation: 1.05\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "eval_factoid.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_factoid.json\")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": 
{}, 307 | "source": [ 308 | "2. Summary" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 11, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "rag_system = MedRAG(retriever=1, question_type=1)\n", 318 | "\n", 319 | "eval_summary = RAG_evaluator(\n", 320 | " rag_model=rag_system,\n", 321 | " path_to_question_json=question_input_summary,\n", 322 | " output_path=output_path_question_summary,\n", 323 | ")" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "eval_summary.run_eval()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 27, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "Summary Statistics for RAG with retriever Unknown\n", 345 | "Total Questions: 121\n", 346 | "\n", 347 | "Response Time:\n", 348 | "Mean: 7.50 seconds\n", 349 | "Standard Deviation: 1.88 seconds\n", 350 | "\n", 351 | "Summary of non-answered questions:\n", 352 | "Absolute count - No Docs Found: 0\n", 353 | "Percentage - No Docs Found: 0.00%\n", 354 | "\n", 355 | "Metrics - Retriever:\n", 356 | "Average Recall: 0.59\n", 357 | "\n", 358 | "Metrics for RAG Usage:\n", 359 | "Average Precision: 0.32\n", 360 | "\n", 361 | "Additional metrics:\n", 362 | "Mean response time retriever: 4.22\n", 363 | "Standard deviation response time retriever: 1.33\n", 364 | "Mean response time generation: 3.28\n", 365 | "Standard deviation response time generation: 1.25\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "eval_summary.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_summary.json\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "3. 
List" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 14, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "rag_system = MedRAG(retriever=1, question_type=4)\n", 387 | "\n", 388 | "eval_list = RAG_evaluator(\n", 389 | " rag_model=rag_system,\n", 390 | " path_to_question_json=question_input_list,\n", 391 | " output_path=output_path_question_list,\n", 392 | ")" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "eval_list.run_eval()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 26, 407 | "metadata": {}, 408 | "outputs": [ 409 | { 410 | "name": "stdout", 411 | "output_type": "stream", 412 | "text": [ 413 | "Summary Statistics for RAG with retriever Unknown\n", 414 | "Total Questions: 131\n", 415 | "\n", 416 | "Response Time:\n", 417 | "Mean: 5.94 seconds\n", 418 | "Standard Deviation: 1.45 seconds\n", 419 | "\n", 420 | "Summary of non-answered questions:\n", 421 | "Absolute count - No Docs Found: 0\n", 422 | "Percentage - No Docs Found: 0.00%\n", 423 | "\n", 424 | "Metrics - Retriever:\n", 425 | "Average Recall: 0.51\n", 426 | "\n", 427 | "Metrics for RAG Usage:\n", 428 | "Average Precision: 0.34\n", 429 | "\n", 430 | "Additional metrics:\n", 431 | "Mean response time retriever: 4.37\n", 432 | "Standard deviation response time retriever: 1.29\n", 433 | "Mean response time generation: 1.57\n", 434 | "Standard deviation response time generation: 0.7\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "eval_list.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_list.json\")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "4. 
Yes/No" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 3, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "rag_system = MedRAG(retriever=1, question_type=2)\n", 456 | "\n", 457 | "eval_yesno = RAG_evaluator(\n", 458 | " rag_model=rag_system,\n", 459 | " path_to_question_json=question_input_yesno,\n", 460 | " output_path=output_path_question_yesno,\n", 461 | ")" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "eval_yesno.run_eval()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 4, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stdout", 480 | "output_type": "stream", 481 | "text": [ 482 | "Summary Statistics for RAG with retriever Unknown\n", 483 | "Total Questions: 160\n", 484 | "\n", 485 | "Response Time:\n", 486 | "Mean: 5.80 seconds\n", 487 | "Standard Deviation: 1.30 seconds\n", 488 | "\n", 489 | "Summary of non-answered questions:\n", 490 | "Absolute count - No Docs Found: 0\n", 491 | "Percentage - No Docs Found: 0.00%\n", 492 | "\n", 493 | "Metrics - RAG Q&A:\n", 494 | "Accuracy: 0.86\n", 495 | "Recall: 0.86\n", 496 | "Precision: 0.89\n", 497 | "F1 Score: 0.86\n", 498 | "\n", 499 | "Metrics - Retriever:\n", 500 | "Average Recall: 0.54\n", 501 | "\n", 502 | "Metrics for RAG Usage:\n", 503 | "Average Precision: 0.28\n", 504 | "\n", 505 | "Additional metrics:\n", 506 | "Mean response time retriever: 4.66\n", 507 | "Standard deviation response time retriever: 1.24\n", 508 | "Mean response time generation: 1.14\n", 509 | "Standard deviation response time generation: 0.38\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "eval_yesno.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_yesno.json\")" 515 | ] 516 | } 517 | ], 518 | "metadata": { 519 | "kernelspec": { 520 | "display_name": "Python 3", 521 | "language": "python", 522 | "name": "python3" 523 | }, 524 | "language_info": { 525 | "codemirror_mode": { 526 | "name": "ipython", 527 | "version": 3 528 | }, 529 | "file_extension": ".py", 530 | "mimetype": "text/x-python", 531 | "name": "python", 532 | "nbconvert_exporter": "python", 533 | "pygments_lexer": "ipython3", 534 | "version": "3.10.12" 535 | } 536 | }, 537 | "nbformat": 4, 538 | "nbformat_minor": 2 539 | } 540 | -------------------------------------------------------------------------------- /evaluation/evaluation_QA_system/RAG_evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | from tqdm import tqdm 3 | import re 4 | import json 5 | import pandas as pd 6 | from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 7 | 8 | 9 | class RAG_evaluator: 10 | """ 11 | Evaluates the performance of a Retrieval-Augmented Generation (RAG) system. 
12 | """ 13 | 14 | def __init__( 15 | self, rag_model, path_to_question_json, output_path, multiplechoice=False 16 | ): 17 | self.rag_model = rag_model 18 | self.path_to_jsonfile = path_to_question_json 19 | self.output_path = output_path 20 | self.multiple_choice = multiplechoice 21 | 22 | def run_eval(self): 23 | """Executes the evaluation of the RAG system.""" 24 | start_time = time.time() # Start timing 25 | 26 | # Read the input JSON file 27 | with open(self.path_to_jsonfile, "r") as file: 28 | data = json.load(file) 29 | 30 | results = [] 31 | i = 0 32 | for question in tqdm(data["questions"], desc="Processing questions"): 33 | response = self.request_selector(question) 34 | if response is not None: 35 | results.append(response) 36 | 37 | # Write the results to the output JSON file 38 | with open(self.output_path, "w") as file: 39 | json.dump(results, file, indent=4) 40 | 41 | elapsed_time = time.time() - start_time 42 | print(f"Results written to {self.output_path}") 43 | print(f"Processing time: {elapsed_time:.2f} seconds") 44 | 45 | def request_selector(self, question): 46 | """Selects the appropriate RAG model and processes the question.""" 47 | try: 48 | if not self.multiple_choice: 49 | match question["type"]: 50 | case "yesno": 51 | return self.handle_yesno(question) 52 | case "list": 53 | return self.handle_list(question) 54 | case "summary" | "factoid": 55 | return self.handle_summary_factoid(question) 56 | case _: 57 | return None 58 | else: 59 | return self.handle_multiple_choice(question) 60 | except Exception as e: 61 | print(e) 62 | return None 63 | 64 | def handle_summary_factoid(self, question): 65 | """Handles 'yesno' questions.""" 66 | start_time = time.time() 67 | rag_answer = json.loads(self.rag_model.get_answer(question["body"])) 68 | elapsed_time = time.time() - start_time 69 | 70 | response = rag_answer.get("response") 71 | k_pubmedids = list(map(str, rag_answer["retrieved_PMIDs"])) 72 | used_pubmedids = list(map(str, rag_answer["used_PMIDs"])) 73 | 74 | retriever_time = rag_answer["retrieval_time"] 75 | generation_time = rag_answer["generation_time"] 76 | 77 | ground_truth_ids = self.extract_pubmedid(question["documents"]) 78 | retrieved_correct_ids, num_correct_retrieved_ids, matching_retrieved_ids = ( 79 | self.compare_pubmed_ids(k_pubmedids, question["documents"]) 80 | ) 81 | ( 82 | rag_used_correct_ids, 83 | rag_used_num_correct_retrieved_ids, 84 | rag_used_matching_retrieved_ids, 85 | ) = self.compare_pubmed_ids(used_pubmedids, question["documents"]) 86 | 87 | limit = 0 88 | 89 | answered_correct = self.llm_eval(limit, response, question["ideal_answer"]) 90 | 91 | return { 92 | "questionid": question["id"], 93 | "querytype": question["type"], 94 | "question": question["body"], 95 | "trueresponse_exact": question["ideal_answer"], 96 | "ragresponse": response, 97 | "answered_correct": answered_correct, 98 | "pmids_retrieved": k_pubmedids, 99 | "pmids_uses_by_rag": used_pubmedids, 100 | "pmids_ground_truth": ground_truth_ids, 101 | "retrieved_correct_pubmedid": retrieved_correct_ids, 102 | "num_correct_retrieved_ids": num_correct_retrieved_ids, 103 | "matching_retrieved_ids": matching_retrieved_ids, 104 | "rag_used_correct_ids": rag_used_correct_ids, 105 | "rag_used_num_correct_retrieved_ids": rag_used_num_correct_retrieved_ids, 106 | "rag_used_matching_retrieved_ids": rag_used_matching_retrieved_ids, 107 | "requestime": elapsed_time, 108 | "retrievment_time": retriever_time, 109 | "generation_time": generation_time, 110 | } 111 | 112 | def handle_list(self, 
question): 113 | """Handles 'yesno' questions.""" 114 | start_time = time.time() 115 | rag_answer = json.loads(self.rag_model.get_answer(question["body"])) 116 | elapsed_time = time.time() - start_time 117 | 118 | response = rag_answer.get("response") 119 | k_pubmedids = list(map(str, rag_answer["retrieved_PMIDs"])) 120 | used_pubmedids = list(map(str, rag_answer["used_PMIDs"])) 121 | 122 | retriever_time = rag_answer["retrieval_time"] 123 | generation_time = rag_answer["generation_time"] 124 | 125 | ground_truth_ids = self.extract_pubmedid(question["documents"]) 126 | retrieved_correct_ids, num_correct_retrieved_ids, matching_retrieved_ids = ( 127 | self.compare_pubmed_ids(k_pubmedids, question["documents"]) 128 | ) 129 | ( 130 | rag_used_correct_ids, 131 | rag_used_num_correct_retrieved_ids, 132 | rag_used_matching_retrieved_ids, 133 | ) = self.compare_pubmed_ids(used_pubmedids, question["documents"]) 134 | 135 | answered_correct, response, exact_answer = self.list_eval( 136 | response, question["exact_answer"] 137 | ) 138 | 139 | return { 140 | "questionid": question["id"], 141 | "querytype": question["type"], 142 | "question": question["body"], 143 | "trueresponse_exact": exact_answer, 144 | "ragresponse": response, 145 | "answered_correct": answered_correct, 146 | "pmids_retrieved": k_pubmedids, 147 | "pmids_uses_by_rag": used_pubmedids, 148 | "pmids_ground_truth": ground_truth_ids, 149 | "retrieved_correct_pubmedid": retrieved_correct_ids, 150 | "num_correct_retrieved_ids": num_correct_retrieved_ids, 151 | "matching_retrieved_ids": matching_retrieved_ids, 152 | "rag_used_correct_ids": rag_used_correct_ids, 153 | "rag_used_num_correct_retrieved_ids": rag_used_num_correct_retrieved_ids, 154 | "rag_used_matching_retrieved_ids": rag_used_matching_retrieved_ids, 155 | "requestime": elapsed_time, 156 | "retrievment_time": retriever_time, 157 | "generation_time": generation_time, 158 | } 159 | 160 | def handle_yesno(self, question): 161 | """Handles 'yesno' questions.""" 162 | start_time = time.time() 163 | rag_answer = json.loads(self.rag_model.get_answer(question["body"])) 164 | elapsed_time = time.time() - start_time 165 | 166 | response = rag_answer.get("response") 167 | k_pubmedids = list(map(str, rag_answer["retrieved_PMIDs"])) 168 | used_pubmedids = list(map(str, rag_answer["used_PMIDs"])) 169 | 170 | retriever_time = rag_answer["retrieval_time"] 171 | generation_time = rag_answer["generation_time"] 172 | 173 | ground_truth_ids = self.extract_pubmedid(question["documents"]) 174 | retrieved_correct_ids, num_correct_retrieved_ids, matching_retrieved_ids = ( 175 | self.compare_pubmed_ids(k_pubmedids, question["documents"]) 176 | ) 177 | ( 178 | rag_used_correct_ids, 179 | rag_used_num_correct_retrieved_ids, 180 | rag_used_matching_retrieved_ids, 181 | ) = self.compare_pubmed_ids(used_pubmedids, question["documents"]) 182 | 183 | answered_correct = self.yesno_eval(response, question["exact_answer"]) 184 | 185 | return { 186 | "questionid": question["id"], 187 | "querytype": question["type"], 188 | "question": question["body"], 189 | "trueresponse_exact": question["exact_answer"].lower(), 190 | "ragresponse": response.lower(), 191 | "answered_correct": answered_correct, 192 | "pmids_retrieved": k_pubmedids, 193 | "pmids_uses_by_rag": used_pubmedids, 194 | "pmids_ground_truth": ground_truth_ids, 195 | "retrieved_correct_pubmedid": retrieved_correct_ids, 196 | "num_correct_retrieved_ids": num_correct_retrieved_ids, 197 | "matching_retrieved_ids": matching_retrieved_ids, 198 | 
"rag_used_correct_ids": rag_used_correct_ids, 199 | "rag_used_num_correct_retrieved_ids": rag_used_num_correct_retrieved_ids, 200 | "rag_used_matching_retrieved_ids": rag_used_matching_retrieved_ids, 201 | "requestime": elapsed_time, 202 | "retrievment_time": retriever_time, 203 | "generation_time": generation_time, 204 | } 205 | 206 | def handle_multiple_choice(self, question): 207 | """Handles multiple-choice questions.""" 208 | start_time = time.time() 209 | rag_answer = json.loads( 210 | self.rag_model.get_answer( 211 | f"{question['question']} \n" 212 | f"1: {question['opa']} \n" 213 | f"2: {question['opb']} \n" 214 | f"3: {question['opc']} \n" 215 | f"4: {question['opd']}" 216 | ) 217 | ) 218 | elapsed_time = time.time() - start_time 219 | 220 | response = rag_answer.get("response") 221 | k_pubmedids = rag_answer["retrieved_PMIDs"] 222 | used_pubmedids = rag_answer["used_PMIDs"] 223 | 224 | retriever_time = rag_answer["retrieval_time"] 225 | generation_time = rag_answer["generation_time"] 226 | 227 | answered_correct = self.evaluate_MEDMCQA(response, question["cop"]) 228 | 229 | return { 230 | "questionid": question["id"], 231 | "querytype": "MEDCQA" + question["choice_type"], 232 | "question": question["question"], 233 | "trueresponse_exact": question["cop"], 234 | "ragresponse": response, 235 | "answered_correct": answered_correct, 236 | "pmids_retrieved": k_pubmedids, 237 | "pmids_uses_by_rag": used_pubmedids, 238 | "pmids_ground_truth": "none_for_question_type", 239 | "requestime": elapsed_time, 240 | "retrievment_time": retriever_time, 241 | "generation_time": generation_time, 242 | } 243 | 244 | def evaluate_MEDMCQA(self, rag_response, true_response): 245 | """Evaluates multiple-choice questions.""" 246 | try: 247 | return int(rag_response) == int(true_response) 248 | except Exception: 249 | return False 250 | 251 | def dummy_llm(self): 252 | # delete if real implementation is done 253 | pass 254 | 255 | def llm_eval(self, limit, rag_response, true_response): 256 | return "Not_evaluated_yet" 257 | limit = limit + 1 258 | response = self.dummy_llm(rag_response, true_response) 259 | if int(response) == 0: 260 | return False 261 | elif int(response) == 1: 262 | return True 263 | else: # if there is no valid response we try 2 times more to get one else we break 264 | if limit < 3: 265 | return self.llm_eval(limit, rag_response, true_response) 266 | else: 267 | return "no_valid_response_possible" 268 | 269 | def yesno_eval(self, rag_response, true_response): 270 | """Evaluates 'yesno' questions.""" 271 | valid_responses = {"yes", "no"} 272 | if ( 273 | rag_response.lower() not in valid_responses 274 | or true_response.lower() not in valid_responses 275 | ): 276 | return False 277 | return rag_response.lower() == true_response.lower() 278 | 279 | def list_eval(self, rag_response, true_response): 280 | # Normalize responses using the helper function 281 | normalized_rag = self.flatten_and_normalize(rag_response) 282 | normalized_true = self.flatten_and_normalize(true_response) 283 | 284 | # Check if at least one item matches 285 | is_any_match = bool(set(normalized_rag) & set(normalized_true)) 286 | 287 | # Check if at least one item matches 288 | is_any_match = bool(set(normalized_rag) & set(normalized_true)) 289 | 290 | # Similarity score (nur wenns de linus wett) 291 | # The similarity score is calculated as the Jaccard similarity index, which is the size of the intersection 292 | # of the two sets divided by the size of their union. 
This gives us a measure of similarity based on how many 293 | # items are common to both sets relative to the total number of unique items across both sets. 294 | # intersection = set(normalized_rag).intersection(set(normalized_true)) 295 | # union = set(normalized_rag).union(set(normalized_true)) 296 | # similarity_score = len(intersection) / len(union) if union else 1.0 # Handle division by zero if both lists are empty 297 | 298 | return is_any_match, normalized_rag, normalized_true # , similarity_score, 299 | 300 | def compare_pubmed_ids(self, pubmed_ids, documents): 301 | """Compares PubMed IDs returned by the RAG system.""" 302 | if not isinstance(pubmed_ids, list): 303 | pubmed_ids = [] 304 | 305 | extracted_ids = [ 306 | re.search(r"pubmed/(\d+)", doc).group(1) 307 | for doc in documents 308 | if re.search(r"pubmed/(\d+)", doc) 309 | ] 310 | 311 | matched_ids = [pid for pid in extracted_ids if pid in pubmed_ids] 312 | 313 | return bool(matched_ids), len(matched_ids), matched_ids 314 | 315 | def extract_pubmedid(self, documents): 316 | """Extracts PubMed IDs from document URLs.""" 317 | return [ 318 | re.search(r"pubmed/(\d+)", doc).group(1) 319 | for doc in documents 320 | if re.search(r"pubmed/(\d+)", doc) 321 | ] 322 | 323 | def manual_accuracy_score(self, y_true, y_pred): 324 | """Calculates the accuracy manually.""" 325 | if len(y_true) != len(y_pred): 326 | raise ValueError( 327 | "The length of true labels and predicted labels must be the same." 328 | ) 329 | return sum(1 for true, pred in zip(y_true, y_pred) if true == pred) / len( 330 | y_true 331 | ) 332 | 333 | def flatten_and_normalize(self, response): 334 | # This helper function flattens nested lists and normalizes strings 335 | flattened = [] 336 | for item in response: 337 | if isinstance(item, list): 338 | # If the item is a list, extend the flattened list with normalized subitems 339 | flattened.extend([str(subitem).lower().strip() for subitem in item]) 340 | else: 341 | # Otherwise, just append the normalized item 342 | flattened.append(str(item).lower().strip()) 343 | return flattened 344 | 345 | # Function to handle lists, flattening nested lists and normalizing strings 346 | def process_list(self, items): 347 | flattened = [] 348 | for item in items: 349 | if isinstance(item, list): 350 | # Recursively process nested lists 351 | flattened.extend(self.process_list(item)) 352 | else: 353 | # Normalize non-list items 354 | flattened.append(self.normalize(item)) 355 | return flattened 356 | 357 | # Helper function to handle string normalization 358 | def normalize(self, item): 359 | return str(item).lower().strip() 360 | 361 | def flatten_and_normalize(self, response): 362 | 363 | # Check if the response is a dictionary and process any lists found within 364 | if isinstance(response, dict): 365 | flattened = [] 366 | for value in response.values(): 367 | if isinstance(value, list): 368 | flattened.extend(self.process_list(value)) 369 | else: 370 | flattened.append(self.normalize(value)) 371 | return flattened 372 | elif isinstance(response, list): 373 | # If the initial response is a list, process it directly 374 | return self.process_list(response) 375 | else: 376 | # Handle single non-list items 377 | return [self.normalize(response)] 378 | 379 | def analyze_performance(self, json_file_path): 380 | """Analysiert die Performance anhand der Daten aus einer JSON-Datei.""" 381 | with open(json_file_path, "r") as file: 382 | data = json.load(file) 383 | 384 | df = pd.DataFrame(data) 385 | 386 | retriever_match = 
re.search(r"ragver_(\d+)", json_file_path) 387 | retriever = retriever_match.group(1) if retriever_match else "Unknown" 388 | print(f"Summary Statistics for RAG with retriever {retriever}") 389 | print(f"Total Questions: {len(df)}") 390 | 391 | mean_response_time = df["requestime"].mean() 392 | sd_response_time = df["requestime"].std() 393 | print("\nResponse Time:") 394 | print(f"Mean: {mean_response_time:.2f} seconds") 395 | print(f"Standard Deviation: {sd_response_time:.2f} seconds") 396 | 397 | 398 | accuracy = self.manual_accuracy_score( 399 | df["trueresponse_exact"], df["ragresponse"] 400 | ) 401 | recall = recall_score( 402 | df["trueresponse_exact"], 403 | df["ragresponse"], 404 | average="weighted", 405 | zero_division=0, 406 | ) 407 | precision = precision_score( 408 | df["trueresponse_exact"], 409 | df["ragresponse"], 410 | average="weighted", 411 | zero_division=0, 412 | ) 413 | f1 = f1_score( 414 | df["trueresponse_exact"], 415 | df["ragresponse"], 416 | average="weighted", 417 | zero_division=0, 418 | ) 419 | 420 | 421 | recall_list = [] 422 | precision_list = [] 423 | 424 | for i in range(len(df)): 425 | ground_truth_pmids = list(df["pmids_ground_truth"][i]) 426 | matching_retrieved_ids = list(df["matching_retrieved_ids"][i]) 427 | retrieved_pmids = list(df["pmids_retrieved"][i]) 428 | matching_used_ids = list(df["rag_used_matching_retrieved_ids"][i]) 429 | used_pmids = list(df["pmids_uses_by_rag"][i]) 430 | 431 | recall_retrieval = ( 432 | len(matching_retrieved_ids) / len(ground_truth_pmids) 433 | if ground_truth_pmids 434 | else 0 435 | ) 436 | precision_rag = ( 437 | len(matching_used_ids) / len(used_pmids) 438 | if used_pmids 439 | else 0 440 | ) 441 | 442 | recall_list.append(recall_retrieval) 443 | precision_list.append(precision_rag) 444 | 445 | avg_recall_retrieval = sum(recall_list) / len(recall_list) 446 | avg_precision_rag = sum(precision_list) / len(precision_list) 447 | 448 | print("\nSummary of non-answered questions:") 449 | count_no_docs_found = (df["ragresponse"] == "no_docs_found").sum() 450 | total_specific_counts = count_no_docs_found # Adjust this if there are other specific counts to include 451 | total_rows = len(df) 452 | percentage_not_answered = (total_specific_counts / total_rows) * 100 453 | print(f"Absolute count - No Docs Found: {total_specific_counts}") 454 | print(f"Percentage - No Docs Found: {percentage_not_answered:.2f}%") 455 | 456 | 457 | print("\nMetrics - RAG Q&A:") 458 | print(f"Accuracy: {accuracy:.2f}") 459 | print(f"Recall: {recall:.2f}") 460 | print(f"Precision: {precision:.2f}") 461 | print(f"F1 Score: {f1:.2f}") 462 | 463 | 464 | print("\nMetrics - Retriever:") 465 | print(f"Average Recall: {avg_recall_retrieval:.2f}") 466 | 467 | print("\nMetrics for RAG Usage:") 468 | print(f"Average Precision: {avg_precision_rag:.2f}") 469 | 470 | print("\nAdditional metrics:") 471 | print(f"Mean response time retriever: {round(df['retrievment_time'].mean(), 2)}") 472 | print(f"Standard deviation response time retriever: {round(df['retrievment_time'].std(), 2)}") 473 | print(f"Mean response time generation: {round(df['generation_time'].mean(), 2)}") 474 | print(f"Standard deviation response time generation: {round(df['generation_time'].std(), 2)}") 475 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/mongodb/eval_mongo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"source": [ 6 | "### Evaluation of the TF-IDF ranking implemented in MongoDB for probabilistic full text search\n", 7 | "\n", 8 | "We configurated a mongodb instance in a docker container using 16GB of RAM and 4 cores and port forwarding to the host machine on port 27017. We indexed the 23.9m documents on the content field using the TF-IDF ranking." 9 | ], 10 | "metadata": { 11 | "collapsed": false 12 | }, 13 | "id": "e99371764495f1bd" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "source": [], 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "id": "42bf126ed742dd07" 22 | }, 23 | { 24 | "cell_type": "code", 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Collections in der Datenbank: ['Docs', 'all_docs']\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from pymongo import MongoClient\n", 36 | "import numpy as np\n", 37 | "from pathlib import Path\n", 38 | "import os\n", 39 | "import json\n", 40 | "from tqdm import tqdm\n", 41 | "\n", 42 | "# Connect to MongoDB\n", 43 | "client = MongoClient('localhost', 27017)\n", 44 | "db = client['PubMed']\n", 45 | "collection = db['Docs']\n", 46 | "\n", 47 | "collections = db.list_collection_names()\n", 48 | "print(\"Collections in der Datenbank:\", collections)" 49 | ], 50 | "metadata": { 51 | "collapsed": false, 52 | "ExecuteTime": { 53 | "end_time": "2024-04-11T14:33:26.784113Z", 54 | "start_time": "2024-04-11T14:33:26.044687Z" 55 | } 56 | }, 57 | "id": "ef097ce096f81125", 58 | "execution_count": 1 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "Now we define the search query and the number of results we want to retrieve. We only retrieve the PMIDs of the documents to compare the results with the relevant documents to the related queries by using the bioASQ dataset." 64 | ], 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "id": "715030c934a661f8" 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "This funktion retrieves the PMIDs of the documents that contain the search query in the content field.\n", 74 | "\n", 75 | "This query only retrieves documents that contain the search query in the content field without any ranking. Thus, the results are not sorted by relevance." 76 | ], 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "id": "24e31789dfd838c2" 81 | }, 82 | { 83 | "cell_type": "code", 84 | "outputs": [], 85 | "source": [ 86 | "def search(query):\n", 87 | " results = collection.find({\"$text\": {\"$search\": query}}).limit(100)\n", 88 | " return results" 89 | ], 90 | "metadata": { 91 | "collapsed": false, 92 | "ExecuteTime": { 93 | "end_time": "2024-04-11T14:39:17.794498Z", 94 | "start_time": "2024-04-11T14:39:17.790223Z" 95 | } 96 | }, 97 | "id": "87a03f7fa95bf5bd", 98 | "execution_count": 10 99 | }, 100 | { 101 | "cell_type": "code", 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": "[{'_id': ObjectId('6617aff5fb3d2cdc7cd9b435'),\n 'id': 'pubmed23n0045_2126',\n 'title': 'Distribution of somatostatin-28 (1-12) in the cat brainstem: an immunocytochemical study.',\n 'content': 'We studied the distribution of somatostatin-28 (1-12)-immunoreactive fibers and cell bodies in the cat brainstem. 
A moderate density of cell bodies containing the peptide was observed in the ventral nucleus of the lateral lemniscus, accessory dorsal tegmental nucleus, retrofacial nucleus and in the lateral reticular nucleus, whereas a low density of such perikarya was found in the interpeduncular nucleus, nucleus incertus, nucleus sagulum, gigantocellular tegmental field, nucleus of the trapezoid body, nucleus praepositus hypoglosii, lateral and magnocellular tegmental fields, nucleus of the solitary tract, nucleus ambiguous and in the nucleus intercalatus. Moreover, a moderate density of somatostatin-28 (1-12)-immunoreactive processes was found in the dorsal nucleus of the raphe, dorsal tegmental nucleus, accessory dorsal tegmental nucleus, periaqueductal gray and in the marginal nucleus of the brachium conjunctivum. Finally, few immunoreactive fibers were visualized in the interpeduncular nucleus, cuneiform nucleus, locus coeruleus, nucleus incertus, superior and inferior central nuclei, nucleus sagulum, ventral nucleus of the lateral lemniscus, nucleus praepositus hypoglosii, medial vestibular nucleus, Kölliker-Fuse area, nucleus ambiguous, retrofacial nucleus, postpyramidal nucleus of the raphe, nucleus of the solitary tract, dorsal motor nucleus of the vagus, lateral reticular nucleus and laminar and alaminar spinal trigeminal nuclei.',\n 'contents': 'Distribution of somatostatin-28 (1-12) in the cat brainstem: an immunocytochemical study. We studied the distribution of somatostatin-28 (1-12)-immunoreactive fibers and cell bodies in the cat brainstem. A moderate density of cell bodies containing the peptide was observed in the ventral nucleus of the lateral lemniscus, accessory dorsal tegmental nucleus, retrofacial nucleus and in the lateral reticular nucleus, whereas a low density of such perikarya was found in the interpeduncular nucleus, nucleus incertus, nucleus sagulum, gigantocellular tegmental field, nucleus of the trapezoid body, nucleus praepositus hypoglosii, lateral and magnocellular tegmental fields, nucleus of the solitary tract, nucleus ambiguous and in the nucleus intercalatus. Moreover, a moderate density of somatostatin-28 (1-12)-immunoreactive processes was found in the dorsal nucleus of the raphe, dorsal tegmental nucleus, accessory dorsal tegmental nucleus, periaqueductal gray and in the marginal nucleus of the brachium conjunctivum. Finally, few immunoreactive fibers were visualized in the interpeduncular nucleus, cuneiform nucleus, locus coeruleus, nucleus incertus, superior and inferior central nuclei, nucleus sagulum, ventral nucleus of the lateral lemniscus, nucleus praepositus hypoglosii, medial vestibular nucleus, Kölliker-Fuse area, nucleus ambiguous, retrofacial nucleus, postpyramidal nucleus of the raphe, nucleus of the solitary tract, dorsal motor nucleus of the vagus, lateral reticular nucleus and laminar and alaminar spinal trigeminal nuclei.',\n 'PMID': 1346714},\n {'_id': ObjectId('6617b085fb3d2cdc7cf495f9'),\n 'id': 'pubmed23n0133_7446',\n 'title': 'Afferent and efferent connections of the medial preoptic area in the rat: a WGA-HRP study.',\n 'content': 'Afferent and efferent connections of the medial preoptic area including medial preoptic nucleus (MP) and periventricular area at the MP level were examined using WGA-HRP as a marker. Injections were performed by insertion of micropipette containing (1) small amount of HRP powder or (2) dryed HRP solution for 24 to 48 hr until the fixation or for 5 min respectively. 
Dorsal and ventral approaches of injection micropipettes were performed and the results were compared. Previously reported reciprocal connections with lateral septum, bed nucleus of the stria terminalis, medial amygdaloid nucleus, lateral hypothalamic nucleus, paraventricular hypothalamic nucleus, ventromedial hypothalamic nucleus, arcuate nucleus, supramammillary nucleus, central gray at the mesencephalon, raphe dorsalis, raphe medianus, and lateral parabrachial nucleus have been confirmed. In addition, we found reciprocal connections with septo-hypothalamic nucleus, amygdalo-hipocampal nucleus, subiculum, parafascicular thalamic nucleus, posterior thalamic nucleus at the caudo-ventral subdivision, median preoptic nucleus, lateral preoptic nucleus, anterior hypothalamic nucleus, periventricular area at the caudal hypothalamic level, dorsomedial hypothalamic nucleus, posterior hypothalamic nucleus, dorsal and ventral premammillary nucleus, lateral mammillary nucleus, peripeduncular nucleus, periventricular gray, ventral tegmental area, interpeduncular nucleus, nucleus raphe pontis, nucleus raphe magnus, pedunculo-pontine tegmental nucleus, gigantocellular reticular nucleus and solitary tract nucleus. The areas which had only efferent connections from MP were accumbens, caudate putamen, ventral pallidum, substantia innominata, lateral habenular nucleus, paratenial thalamic nucleus, paraventricular thalamic nucleus, mediodorsal thalamic nucleus, reuniens thalamic nucleus, median eminence, medial mammillary nucleus, subthalamic nucleus, pars compacta of substantia nigra, oculomotor nucleus, red nucleus, laterodorsal tegmental nucleus, reticular tegmental nucleus, cuneiform nucleus, nucleus locus coeruleus, and dorsal motor nucleus of vagus among which substantia innominata and median eminence were previously reported. Efferent connections to the nucleus of Darkschewitsch, interstitial nucleus of Cajal, dorsal tegmental nucleus, ventral tegmental nucleus, vestibular nuclei, nucleus raphe obsculus were very weak or abscent in the ventral approach while they were observed in dorsal approach. Previously reported afferent connections from dorsal tegmental nucleus, cuneiform nucleus, and nucleus locus ceruleus were not detected in this study.(ABSTRACT TRUNCATED AT 400 WORDS)',\n 'contents': 'Afferent and efferent connections of the medial preoptic area in the rat: a WGA-HRP study. Afferent and efferent connections of the medial preoptic area including medial preoptic nucleus (MP) and periventricular area at the MP level were examined using WGA-HRP as a marker. Injections were performed by insertion of micropipette containing (1) small amount of HRP powder or (2) dryed HRP solution for 24 to 48 hr until the fixation or for 5 min respectively. Dorsal and ventral approaches of injection micropipettes were performed and the results were compared. Previously reported reciprocal connections with lateral septum, bed nucleus of the stria terminalis, medial amygdaloid nucleus, lateral hypothalamic nucleus, paraventricular hypothalamic nucleus, ventromedial hypothalamic nucleus, arcuate nucleus, supramammillary nucleus, central gray at the mesencephalon, raphe dorsalis, raphe medianus, and lateral parabrachial nucleus have been confirmed. 
In addition, we found reciprocal connections with septo-hypothalamic nucleus, amygdalo-hipocampal nucleus, subiculum, parafascicular thalamic nucleus, posterior thalamic nucleus at the caudo-ventral subdivision, median preoptic nucleus, lateral preoptic nucleus, anterior hypothalamic nucleus, periventricular area at the caudal hypothalamic level, dorsomedial hypothalamic nucleus, posterior hypothalamic nucleus, dorsal and ventral premammillary nucleus, lateral mammillary nucleus, peripeduncular nucleus, periventricular gray, ventral tegmental area, interpeduncular nucleus, nucleus raphe pontis, nucleus raphe magnus, pedunculo-pontine tegmental nucleus, gigantocellular reticular nucleus and solitary tract nucleus. The areas which had only efferent connections from MP were accumbens, caudate putamen, ventral pallidum, substantia innominata, lateral habenular nucleus, paratenial thalamic nucleus, paraventricular thalamic nucleus, mediodorsal thalamic nucleus, reuniens thalamic nucleus, median eminence, medial mammillary nucleus, subthalamic nucleus, pars compacta of substantia nigra, oculomotor nucleus, red nucleus, laterodorsal tegmental nucleus, reticular tegmental nucleus, cuneiform nucleus, nucleus locus coeruleus, and dorsal motor nucleus of vagus among which substantia innominata and median eminence were previously reported. Efferent connections to the nucleus of Darkschewitsch, interstitial nucleus of Cajal, dorsal tegmental nucleus, ventral tegmental nucleus, vestibular nuclei, nucleus raphe obsculus were very weak or abscent in the ventral approach while they were observed in dorsal approach. Previously reported afferent connections from dorsal tegmental nucleus, cuneiform nucleus, and nucleus locus ceruleus were not detected in this study.(ABSTRACT TRUNCATED AT 400 WORDS)',\n 'PMID': 3995367},\n {'_id': ObjectId('6617b083fb3d2cdc7cf43ab3'),\n 'id': 'pubmed23n0132_4049',\n 'title': '[Afferent connections of the nucleus of the facial nerve in the cat detected using the technic of retrograde axonal transport of horseradish peroxidase].',\n 'content': 'Neuronal populations in the brainstem and spinal cord as sources of fibre pathways to the facial nucleus were studied in adult cats by means of microionophoretic injections of horseradish peroxidase into restricted zones of the facial nucleus. Projection from nucleus nervi hypoglossi, nucleus praepositus hypoglossi, nucleus raphe pallidus, nucleus intercalatus, medial nucleus of the solitary tract, dorsal motor nucleus of the vagus, neurons of genu of the facial nerve, ipsilateral red nucleus and reticular formation of the midbrain to the facial nucleus are found. Projections from a number of other brain structures to the facial nucleus are confirmed. A topographical map of distribution of the brainstem and spinal cord afferents in the facial nucleus is proposed.',\n 'contents': '[Afferent connections of the nucleus of the facial nerve in the cat detected using the technic of retrograde axonal transport of horseradish peroxidase]. Neuronal populations in the brainstem and spinal cord as sources of fibre pathways to the facial nucleus were studied in adult cats by means of microionophoretic injections of horseradish peroxidase into restricted zones of the facial nucleus. 
Projection from nucleus nervi hypoglossi, nucleus praepositus hypoglossi, nucleus raphe pallidus, nucleus intercalatus, medial nucleus of the solitary tract, dorsal motor nucleus of the vagus, neurons of genu of the facial nerve, ipsilateral red nucleus and reticular formation of the midbrain to the facial nucleus are found. Projections from a number of other brain structures to the facial nucleus are confirmed. A topographical map of distribution of the brainstem and spinal cord afferents in the facial nucleus is proposed.',\n 'PMID': 3960201},\n {'_id': ObjectId('6617b0bcfb3d2cdc7cfebd34'),\n 'id': 'pubmed23n0237_9745',\n 'title': 'Brainstem afferents to the thalamus in a lizard, Varanus exanthematicus.',\n 'content': 'HRP was injected into various thalamic nuclei in order to investigate the brainstem projections to the thalamus in the lizard Varanus exanthematicus. Nucleus dorsomedialis receives afferents from the septal area, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, area triangularis, nucleus raphes superior, nucleus reticularis inferior, and locus coeruleus. Nucleus dorsolateralis receives afferents from septal area, nucleus dorsomedialis, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, and the torus semicircularis. Nucleus rotundus receives an input from the tectum mesencephali, the pretectal area, and from the mesencephalic reticular formation. Nucleus intermedius dorsalis receives afferents from the dorsal column nuclei and nucleus periventricularis hypothalami. Nucleus ventrolateralis receives afferents from the dorsal column nuclei, the trigeminal complex, locus coeruleus, and the reticular formation. Nucleus ventromedialis also receives afferents from the trigeminal complex and the reticular formation. Afferents to the habenula have been demonstrated from the septal area, nucleus entopeduncularis anterior, triangular area, nucleus periventricularis hypothalami, nucleus interpeduncularis, nucleus raphes superior, locus coeruleus, nucleus isthmi, nucleus dorsalis motorius nervi vagi, and the mesencephalic tegmentum. The laminar part of the torus semicicularis projects to nucleus medialis.',\n 'contents': 'Brainstem afferents to the thalamus in a lizard, Varanus exanthematicus. HRP was injected into various thalamic nuclei in order to investigate the brainstem projections to the thalamus in the lizard Varanus exanthematicus. Nucleus dorsomedialis receives afferents from the septal area, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, area triangularis, nucleus raphes superior, nucleus reticularis inferior, and locus coeruleus. Nucleus dorsolateralis receives afferents from septal area, nucleus dorsomedialis, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, and the torus semicircularis. Nucleus rotundus receives an input from the tectum mesencephali, the pretectal area, and from the mesencephalic reticular formation. Nucleus intermedius dorsalis receives afferents from the dorsal column nuclei and nucleus periventricularis hypothalami. Nucleus ventrolateralis receives afferents from the dorsal column nuclei, the trigeminal complex, locus coeruleus, and the reticular formation. Nucleus ventromedialis also receives afferents from the trigeminal complex and the reticular formation. 
Afferents to the habenula have been demonstrated from the septal area, nucleus entopeduncularis anterior, triangular area, nucleus periventricularis hypothalami, nucleus interpeduncularis, nucleus raphes superior, locus coeruleus, nucleus isthmi, nucleus dorsalis motorius nervi vagi, and the mesencephalic tegmentum. The laminar part of the torus semicicularis projects to nucleus medialis.',\n 'PMID': 7130476},\n {'_id': ObjectId('6617b007fb3d2cdc7cde3763'),\n 'id': 'pubmed23n0058_6194',\n 'title': 'Comparative cytoarchitectonic analysis of some visual pretectal nuclei in teleosts.',\n 'content': 'The posterior pretectal nucleus, which in Osteoglossum receives second order visual input and projects to the inferior lobe of the hypothalamus, was identified and characterized in species from all major groups of non-neoteleost teleosts. The hypothesis that the posterior pretectal nucleus in these species is homologous to both the pars intermedius of the superficial pretectal nucleus and nucleus glomerulosus in acanthopterygians is supported by multiple similarities in relative position and cytoarchitecture. Nucleus corticalis, which receives retinal input and projects to the posterior pretectal nucleus (or to nucleus glomerulosus), was identified in species belonging to three of the four major teleost radiations. Both the posterior pretectal nucleus and nucleus corticalis are plesiomorphic for teleosts. The presence of glomeruli in the posterior pretectal nucleus and nucleus glomerulosus in esocids and acanthopterygians, respectively, and the presence of two nuclei, the pars intermedius and nucleus glomerulosus, in acanthopterygians, as opposed to one nucleus, the posterior pretectal nucleus, are apomorphies.',\n 'contents': 'Comparative cytoarchitectonic analysis of some visual pretectal nuclei in teleosts. The posterior pretectal nucleus, which in Osteoglossum receives second order visual input and projects to the inferior lobe of the hypothalamus, was identified and characterized in species from all major groups of non-neoteleost teleosts. The hypothesis that the posterior pretectal nucleus in these species is homologous to both the pars intermedius of the superficial pretectal nucleus and nucleus glomerulosus in acanthopterygians is supported by multiple similarities in relative position and cytoarchitecture. Nucleus corticalis, which receives retinal input and projects to the posterior pretectal nucleus (or to nucleus glomerulosus), was identified in species belonging to three of the four major teleost radiations. Both the posterior pretectal nucleus and nucleus corticalis are plesiomorphic for teleosts. 
The presence of glomeruli in the posterior pretectal nucleus and nucleus glomerulosus in esocids and acanthopterygians, respectively, and the presence of two nuclei, the pars intermedius and nucleus glomerulosus, in acanthopterygians, as opposed to one nucleus, the posterior pretectal nucleus, are apomorphies.',\n 'PMID': 1742601}]" 106 | }, 107 | "execution_count": 14, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "pmid_liste = search(\"Is it possible to visualize subtahalamic nucleus by using transcranial ultrasound?\")\n", 114 | "\n", 115 | "list(pmid_liste)[:5]" 116 | ], 117 | "metadata": { 118 | "collapsed": false, 119 | "ExecuteTime": { 120 | "end_time": "2024-04-11T14:48:21.797401Z", 121 | "start_time": "2024-04-11T14:48:21.771026Z" 122 | } 123 | }, 124 | "id": "885b4cd3f85a48b4", 125 | "execution_count": 14 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "source": [ 130 | "This funktion retrieves the PMIDs of the documents that contain the search query in the content field. The results are sorted by the TF-IDF ranking.\n", 131 | "\n", 132 | "It takes significantly longer to retrieve the results because the documents are sorted by the TF-IDF ranking." 133 | ], 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "id": "998b429c804633be" 138 | }, 139 | { 140 | "cell_type": "code", 141 | "outputs": [], 142 | "source": [ 143 | "def search_TF_IDF(query, k):\n", 144 | " results = collection.find({\"$text\": {\"$search\": query}}, {\"_id\": 0, \"PMID\": 1, \"score\": {\"$meta\": \"textScore\"}}).sort([(\"score\", {\"$meta\": \"textScore\"})]).limit(k)\n", 145 | " return results" 146 | ], 147 | "metadata": { 148 | "collapsed": false, 149 | "ExecuteTime": { 150 | "end_time": "2024-04-11T14:33:28.996003Z", 151 | "start_time": "2024-04-11T14:33:28.989219Z" 152 | } 153 | }, 154 | "id": "2398467d2a980e74", 155 | "execution_count": 2 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "source": [ 160 | "MongoDB uses lazy evaluation. Thus, the query is not executed until the results are accessed. We access the results to measure the time it takes to retrieve the results." 161 | ], 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "id": "3a316c740cda85d" 166 | }, 167 | { 168 | "cell_type": "code", 169 | "outputs": [], 170 | "source": [ 171 | "pmid_liste = search_TF_IDF(\"Is it possible to visualize subtahalamic nucleus by using transcranial ultrasound?\", 10)" 172 | ], 173 | "metadata": { 174 | "collapsed": false, 175 | "ExecuteTime": { 176 | "end_time": "2024-04-11T14:33:31.255852Z", 177 | "start_time": "2024-04-11T14:33:31.249673Z" 178 | } 179 | }, 180 | "id": "52bfc4116b88b82d", 181 | "execution_count": 3 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "source": [ 186 | "The results are retrieved as a cursor. We convert the cursor to a list to access the results. This takes a while because the results are sorted by the TF-IDF ranking." 
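The timing mentioned here can be measured explicitly. A minimal sketch, assuming the search_TF_IDF function and the MongoDB connection defined earlier in this notebook (the query string is only illustrative): materializing the cursor with list() is what actually executes the sorted query, so that is the step to time.

import time

query = "transcranial ultrasound visualization of the subthalamic nucleus"

start = time.perf_counter()
docs = list(search_TF_IDF(query, 10))   # forces execution of the $text query and the textScore sort
elapsed = time.perf_counter() - start

print(f"Retrieved {len(docs)} documents in {elapsed:.1f} s")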
187 | ], 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "id": "6a462e683a59117" 192 | }, 193 | { 194 | "cell_type": "code", 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": "[{'PMID': 1627439, 'score': 3.31173513986014},\n {'PMID': 64478, 'score': 3.2818834459459465},\n {'PMID': 1705058, 'score': 3.2310779816513757},\n {'PMID': 6763082, 'score': 3.229017857142857},\n {'PMID': 2683312, 'score': 3.1279296875},\n {'PMID': 1650433, 'score': 3.095602766798419},\n {'PMID': 2473416, 'score': 3.095472440944882},\n {'PMID': 3473897, 'score': 3.07967032967033},\n {'PMID': 1519071, 'score': 3.0697115384615383},\n {'PMID': 3545257, 'score': 3.0483333333333333}]" 199 | }, 200 | "execution_count": 4, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "pmid_liste = list(pmid_liste)\n", 207 | "pmid_liste" 208 | ], 209 | "metadata": { 210 | "collapsed": false, 211 | "ExecuteTime": { 212 | "end_time": "2024-04-11T14:33:59.618631Z", 213 | "start_time": "2024-04-11T14:33:33.265473Z" 214 | } 215 | }, 216 | "id": "c84b73f819247fb5", 217 | "execution_count": 4 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "source": [ 222 | "25 seconds are needed to retrieve the results for the query. This time is impractical for a real-time search engine." 223 | ], 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "id": "74e53b14defc9b03" 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 2 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython2", 246 | "version": "2.7.6" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 5 251 | } 252 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/elasticsearch/elastic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "outputs": [], 6 | "source": [ 7 | "import json\n", 8 | "from elasticsearch import Elasticsearch, helpers\n", 9 | "import urllib3\n", 10 | "import os\n", 11 | "\n", 12 | "\n", 13 | "elastic_password = os.getenv('ELASTIC_PASSWORD_SERVER')\n", 14 | "\n", 15 | "es = Elasticsearch(\n", 16 | " ['https://localhost:9200'],\n", 17 | " basic_auth=('elastic', elastic_password),\n", 18 | " verify_certs=False,\n", 19 | " ca_certs=\"C:/Users/linus/http_ca.crt\"\n", 20 | ")\n", 21 | "\n", 22 | "urllib3.disable_warnings()" 23 | ], 24 | "metadata": { 25 | "collapsed": false, 26 | "ExecuteTime": { 27 | "end_time": "2024-04-06T20:56:12.028476Z", 28 | "start_time": "2024-04-06T20:56:11.995320Z" 29 | } 30 | }, 31 | "id": "c730151aee91dee3", 32 | "execution_count": 7 33 | }, 34 | { 35 | "cell_type": "code", 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": "ObjectApiResponse({'name': 'b3472380ffa2', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'DQvmIapdSNS30vfmGkeR8w', 'version': {'number': '8.13.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '9287f29bba5e270bd51d557b8daccb7d118ba247', 'build_date': '2024-03-29T10:05:29.787251984Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 
'You Know, for Search'})" 40 | }, 41 | "execution_count": 8, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "# Test the connection\n", 48 | "es.info()" 49 | ], 50 | "metadata": { 51 | "collapsed": false, 52 | "ExecuteTime": { 53 | "end_time": "2024-04-06T20:56:15.903778Z", 54 | "start_time": "2024-04-06T20:56:15.741059Z" 55 | } 56 | }, 57 | "id": "9900f0eb7bdc320", 58 | "execution_count": 8 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "### Indexing Documents with Embeddings into Elasticsearch for Vector Similarity Search" 64 | ], 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "id": "a2769980d775a70d" 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "initialize the index with the appropriate mapping for the dense vector field." 74 | ], 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "id": "d35e8d0e15557047" 79 | }, 80 | { 81 | "cell_type": "code", 82 | "outputs": [], 83 | "source": [ 84 | "# drop the index if it already exists\n", 85 | "if es.indices.exists(index='pubmed_emb_index'):\n", 86 | " es.indices.delete(index='pubmed_emb_index')" 87 | ], 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "id": "a4809b06942eaf91", 92 | "execution_count": 14 93 | }, 94 | { 95 | "cell_type": "code", 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "C:\\Users\\linus\\anaconda3\\Lib\\site-packages\\urllib3\\connectionpool.py:1056: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", 102 | " warnings.warn(\n", 103 | "C:\\Users\\linus\\anaconda3\\Lib\\site-packages\\urllib3\\connectionpool.py:1056: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", 104 | " warnings.warn(\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Define the index name\n", 110 | "index_name = \"pubmed_emb_index\"\n", 111 | "\n", 112 | "# Check if the index already exists\n", 113 | "if not es.indices.exists(index=index_name):\n", 114 | " # Define the mapping\n", 115 | " mapping = {\n", 116 | " \"mappings\": {\n", 117 | " \"properties\": {\n", 118 | " \"embeddings\": {\"type\": \"dense_vector\", \"dims\": 768} # Adjust the dimension size as needed\n", 119 | " # Add other field mappings as necessary\n", 120 | " }\n", 121 | " }\n", 122 | " }\n", 123 | " \n", 124 | " # Create the index with the defined mapping\n", 125 | " es.indices.create(index=index_name, body=mapping)" 126 | ], 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "id": "e950a01927be6bdd", 131 | "execution_count": 7 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "source": [ 136 | "load the JSONL files containing the PubMed documents, extract the embeddings, and index the documents into Elasticsearch." 
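One practical note before the ingestion cell below: the bulk run aborts partway through with a ConnectionTimeout. A hedged sketch of a more defensive setup, assuming the same cluster, the elastic_password variable from the first cell, and a placeholder JSONL file name; the timeout and retry values are illustrative rather than tuned, they only show where such settings live on the client:

from elasticsearch import Elasticsearch, helpers
import json

# Assumptions: same cluster and credentials as above; 'chunk_000.jsonl' is a placeholder file name.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=False,
    request_timeout=120,     # give slow bulk requests more time than the client default
    retry_on_timeout=True,   # retry a timed-out request instead of failing the whole run
    max_retries=3,
)

def actions_from_jsonl(path, index_name):
    # Stream one bulk action per JSONL line instead of holding large batches in memory.
    with open(path, 'r') as fh:
        for line in fh:
            yield {"_index": index_name, "_source": json.loads(line)}

helpers.bulk(es, actions_from_jsonl('chunk_000.jsonl', 'pubmed_emb_index'), chunk_size=500)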
137 | ], 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "id": "d299505c678d88c3" 142 | }, 143 | { 144 | "cell_type": "code", 145 | "outputs": [ 146 | { 147 | "name": "stderr", 148 | "output_type": "stream", 149 | "text": [ 150 | " 63%|██████▎ | 63/100 [35:47<51:34, 83.63s/it]" 151 | ] 152 | }, 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "An error occurred: Connection timed out\n" 158 | ] 159 | }, 160 | { 161 | "name": "stderr", 162 | "output_type": "stream", 163 | "text": [ 164 | "100%|██████████| 100/100 [48:28<00:00, 29.09s/it]\n" 165 | ] 166 | }, 167 | { 168 | "ename": "ConnectionTimeout", 169 | "evalue": "Connection timed out", 170 | "output_type": "error", 171 | "traceback": [ 172 | "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", 173 | "\u001B[1;31mConnectionTimeout\u001B[0m Traceback (most recent call last)", 174 | "Cell \u001B[1;32mIn[15], line 49\u001B[0m\n\u001B[0;32m 46\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mIndexing complete\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[0;32m 48\u001B[0m \u001B[38;5;66;03m# Rufen Sie die Funktion auf, um die Dokumente zu indizieren\u001B[39;00m\n\u001B[1;32m---> 49\u001B[0m bulk_index_documents(source_directory, index_name)\n", 175 | "Cell \u001B[1;32mIn[15], line 44\u001B[0m, in \u001B[0;36mbulk_index_documents\u001B[1;34m(source_directory, index_name)\u001B[0m\n\u001B[0;32m 42\u001B[0m \u001B[38;5;66;03m# Indexieren Sie alle verbleibenden Dokumente\u001B[39;00m\n\u001B[0;32m 43\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m actions:\n\u001B[1;32m---> 44\u001B[0m helpers\u001B[38;5;241m.\u001B[39mbulk(es, actions)\n\u001B[0;32m 46\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mIndexing complete\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", 176 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\helpers\\actions.py:521\u001B[0m, in \u001B[0;36mbulk\u001B[1;34m(client, actions, stats_only, ignore_status, *args, **kwargs)\u001B[0m\n\u001B[0;32m 519\u001B[0m \u001B[38;5;66;03m# make streaming_bulk yield successful results so we can count them\u001B[39;00m\n\u001B[0;32m 520\u001B[0m kwargs[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124myield_ok\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n\u001B[1;32m--> 521\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m ok, item \u001B[38;5;129;01min\u001B[39;00m streaming_bulk(\n\u001B[0;32m 522\u001B[0m client, actions, ignore_status\u001B[38;5;241m=\u001B[39mignore_status, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[0;32m 523\u001B[0m ):\n\u001B[0;32m 524\u001B[0m \u001B[38;5;66;03m# go through request-response pairs and detect failures\u001B[39;00m\n\u001B[0;32m 525\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m ok:\n\u001B[0;32m 526\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m stats_only:\n", 177 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\helpers\\actions.py:436\u001B[0m, in \u001B[0;36mstreaming_bulk\u001B[1;34m(client, actions, chunk_size, max_chunk_bytes, raise_on_error, expand_action_callback, raise_on_exception, max_retries, initial_backoff, max_backoff, yield_ok, ignore_status, *args, **kwargs)\u001B[0m\n\u001B[0;32m 433\u001B[0m 
time\u001B[38;5;241m.\u001B[39msleep(\u001B[38;5;28mmin\u001B[39m(max_backoff, initial_backoff \u001B[38;5;241m*\u001B[39m \u001B[38;5;241m2\u001B[39m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m (attempt \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m)))\n\u001B[0;32m 435\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 436\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m data, (ok, info) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mzip\u001B[39m(\n\u001B[0;32m 437\u001B[0m bulk_data,\n\u001B[0;32m 438\u001B[0m _process_bulk_chunk(\n\u001B[0;32m 439\u001B[0m client,\n\u001B[0;32m 440\u001B[0m bulk_actions,\n\u001B[0;32m 441\u001B[0m bulk_data,\n\u001B[0;32m 442\u001B[0m raise_on_exception,\n\u001B[0;32m 443\u001B[0m raise_on_error,\n\u001B[0;32m 444\u001B[0m ignore_status,\n\u001B[0;32m 445\u001B[0m \u001B[38;5;241m*\u001B[39margs,\n\u001B[0;32m 446\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs,\n\u001B[0;32m 447\u001B[0m ),\n\u001B[0;32m 448\u001B[0m ):\n\u001B[0;32m 449\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m ok:\n\u001B[0;32m 450\u001B[0m action, info \u001B[38;5;241m=\u001B[39m info\u001B[38;5;241m.\u001B[39mpopitem()\n", 178 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\helpers\\actions.py:339\u001B[0m, in \u001B[0;36m_process_bulk_chunk\u001B[1;34m(client, bulk_actions, bulk_data, raise_on_exception, raise_on_error, ignore_status, *args, **kwargs)\u001B[0m\n\u001B[0;32m 335\u001B[0m ignore_status \u001B[38;5;241m=\u001B[39m (ignore_status,)\n\u001B[0;32m 337\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 338\u001B[0m \u001B[38;5;66;03m# send the actual request\u001B[39;00m\n\u001B[1;32m--> 339\u001B[0m resp \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mbulk(\u001B[38;5;241m*\u001B[39margs, operations\u001B[38;5;241m=\u001B[39mbulk_actions, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs) \u001B[38;5;66;03m# type: ignore[arg-type]\u001B[39;00m\n\u001B[0;32m 340\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m ApiError \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[0;32m 341\u001B[0m gen \u001B[38;5;241m=\u001B[39m _process_bulk_chunk_error(\n\u001B[0;32m 342\u001B[0m error\u001B[38;5;241m=\u001B[39me,\n\u001B[0;32m 343\u001B[0m bulk_data\u001B[38;5;241m=\u001B[39mbulk_data,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 346\u001B[0m raise_on_error\u001B[38;5;241m=\u001B[39mraise_on_error,\n\u001B[0;32m 347\u001B[0m )\n", 179 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\utils.py:446\u001B[0m, in \u001B[0;36m_rewrite_parameters..wrapper..wrapped\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 443\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n\u001B[0;32m 444\u001B[0m \u001B[38;5;28;01mpass\u001B[39;00m\n\u001B[1;32m--> 446\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m api(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", 180 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\__init__.py:714\u001B[0m, in \u001B[0;36mElasticsearch.bulk\u001B[1;34m(self, operations, body, index, error_trace, filter_path, human, pipeline, pretty, refresh, require_alias, routing, source, source_excludes, source_includes, timeout, wait_for_active_shards)\u001B[0m\n\u001B[0;32m 709\u001B[0m __body \u001B[38;5;241m=\u001B[39m operations \u001B[38;5;28;01mif\u001B[39;00m operations 
\u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;28;01melse\u001B[39;00m body\n\u001B[0;32m 710\u001B[0m __headers \u001B[38;5;241m=\u001B[39m {\n\u001B[0;32m 711\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124maccept\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mapplication/json\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 712\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcontent-type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mapplication/x-ndjson\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 713\u001B[0m }\n\u001B[1;32m--> 714\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mperform_request( \u001B[38;5;66;03m# type: ignore[return-value]\u001B[39;00m\n\u001B[0;32m 715\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mPUT\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 716\u001B[0m __path,\n\u001B[0;32m 717\u001B[0m params\u001B[38;5;241m=\u001B[39m__query,\n\u001B[0;32m 718\u001B[0m headers\u001B[38;5;241m=\u001B[39m__headers,\n\u001B[0;32m 719\u001B[0m body\u001B[38;5;241m=\u001B[39m__body,\n\u001B[0;32m 720\u001B[0m endpoint_id\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mbulk\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 721\u001B[0m path_parts\u001B[38;5;241m=\u001B[39m__path_parts,\n\u001B[0;32m 722\u001B[0m )\n", 181 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\_base.py:271\u001B[0m, in \u001B[0;36mBaseClient.perform_request\u001B[1;34m(self, method, path, params, headers, body, endpoint_id, path_parts)\u001B[0m\n\u001B[0;32m 255\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mperform_request\u001B[39m(\n\u001B[0;32m 256\u001B[0m \u001B[38;5;28mself\u001B[39m,\n\u001B[0;32m 257\u001B[0m method: \u001B[38;5;28mstr\u001B[39m,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 264\u001B[0m path_parts: Optional[Mapping[\u001B[38;5;28mstr\u001B[39m, Any]] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[0;32m 265\u001B[0m ) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m ApiResponse[Any]:\n\u001B[0;32m 266\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_otel\u001B[38;5;241m.\u001B[39mspan(\n\u001B[0;32m 267\u001B[0m method,\n\u001B[0;32m 268\u001B[0m endpoint_id\u001B[38;5;241m=\u001B[39mendpoint_id,\n\u001B[0;32m 269\u001B[0m path_parts\u001B[38;5;241m=\u001B[39mpath_parts \u001B[38;5;129;01mor\u001B[39;00m {},\n\u001B[0;32m 270\u001B[0m ) \u001B[38;5;28;01mas\u001B[39;00m otel_span:\n\u001B[1;32m--> 271\u001B[0m response \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_perform_request(\n\u001B[0;32m 272\u001B[0m method,\n\u001B[0;32m 273\u001B[0m path,\n\u001B[0;32m 274\u001B[0m params\u001B[38;5;241m=\u001B[39mparams,\n\u001B[0;32m 275\u001B[0m headers\u001B[38;5;241m=\u001B[39mheaders,\n\u001B[0;32m 276\u001B[0m body\u001B[38;5;241m=\u001B[39mbody,\n\u001B[0;32m 277\u001B[0m otel_span\u001B[38;5;241m=\u001B[39motel_span,\n\u001B[0;32m 278\u001B[0m )\n\u001B[0;32m 279\u001B[0m otel_span\u001B[38;5;241m.\u001B[39mset_elastic_cloud_metadata(response\u001B[38;5;241m.\u001B[39mmeta\u001B[38;5;241m.\u001B[39mheaders)\n\u001B[0;32m 280\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m response\n", 182 | "File 
\u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\_base.py:316\u001B[0m, in \u001B[0;36mBaseClient._perform_request\u001B[1;34m(self, method, path, params, headers, body, otel_span)\u001B[0m\n\u001B[0;32m 313\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 314\u001B[0m target \u001B[38;5;241m=\u001B[39m path\n\u001B[1;32m--> 316\u001B[0m meta, resp_body \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtransport\u001B[38;5;241m.\u001B[39mperform_request(\n\u001B[0;32m 317\u001B[0m method,\n\u001B[0;32m 318\u001B[0m target,\n\u001B[0;32m 319\u001B[0m headers\u001B[38;5;241m=\u001B[39mrequest_headers,\n\u001B[0;32m 320\u001B[0m body\u001B[38;5;241m=\u001B[39mbody,\n\u001B[0;32m 321\u001B[0m request_timeout\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_request_timeout,\n\u001B[0;32m 322\u001B[0m max_retries\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_max_retries,\n\u001B[0;32m 323\u001B[0m retry_on_status\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_retry_on_status,\n\u001B[0;32m 324\u001B[0m retry_on_timeout\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_retry_on_timeout,\n\u001B[0;32m 325\u001B[0m client_meta\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_client_meta,\n\u001B[0;32m 326\u001B[0m otel_span\u001B[38;5;241m=\u001B[39motel_span,\n\u001B[0;32m 327\u001B[0m )\n\u001B[0;32m 329\u001B[0m \u001B[38;5;66;03m# HEAD with a 404 is returned as a normal response\u001B[39;00m\n\u001B[0;32m 330\u001B[0m \u001B[38;5;66;03m# since this is used as an 'exists' functionality.\u001B[39;00m\n\u001B[0;32m 331\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (method \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mHEAD\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m meta\u001B[38;5;241m.\u001B[39mstatus \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m404\u001B[39m) \u001B[38;5;129;01mand\u001B[39;00m (\n\u001B[0;32m 332\u001B[0m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;241m200\u001B[39m \u001B[38;5;241m<\u001B[39m\u001B[38;5;241m=\u001B[39m meta\u001B[38;5;241m.\u001B[39mstatus \u001B[38;5;241m<\u001B[39m \u001B[38;5;241m299\u001B[39m\n\u001B[0;32m 333\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m (\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 337\u001B[0m )\n\u001B[0;32m 338\u001B[0m ):\n", 183 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elastic_transport\\_transport.py:342\u001B[0m, in \u001B[0;36mTransport.perform_request\u001B[1;34m(self, method, target, body, headers, max_retries, retry_on_status, retry_on_timeout, request_timeout, client_meta, otel_span)\u001B[0m\n\u001B[0;32m 340\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 341\u001B[0m otel_span\u001B[38;5;241m.\u001B[39mset_node_metadata(node\u001B[38;5;241m.\u001B[39mhost, node\u001B[38;5;241m.\u001B[39mport, node\u001B[38;5;241m.\u001B[39mbase_url, target)\n\u001B[1;32m--> 342\u001B[0m resp \u001B[38;5;241m=\u001B[39m node\u001B[38;5;241m.\u001B[39mperform_request(\n\u001B[0;32m 343\u001B[0m method,\n\u001B[0;32m 344\u001B[0m target,\n\u001B[0;32m 345\u001B[0m body\u001B[38;5;241m=\u001B[39mrequest_body,\n\u001B[0;32m 346\u001B[0m headers\u001B[38;5;241m=\u001B[39mrequest_headers,\n\u001B[0;32m 347\u001B[0m request_timeout\u001B[38;5;241m=\u001B[39mrequest_timeout,\n\u001B[0;32m 
348\u001B[0m )\n\u001B[0;32m 349\u001B[0m _logger\u001B[38;5;241m.\u001B[39minfo(\n\u001B[0;32m 350\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m [status:\u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m duration:\u001B[39m\u001B[38;5;132;01m%.3f\u001B[39;00m\u001B[38;5;124ms]\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 351\u001B[0m \u001B[38;5;241m%\u001B[39m (\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 357\u001B[0m )\n\u001B[0;32m 358\u001B[0m )\n\u001B[0;32m 360\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m method \u001B[38;5;241m!=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mHEAD\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n", 184 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elastic_transport\\_node\\_http_urllib3.py:202\u001B[0m, in \u001B[0;36mUrllib3HttpNode.perform_request\u001B[1;34m(self, method, target, body, headers, request_timeout)\u001B[0m\n\u001B[0;32m 194\u001B[0m err \u001B[38;5;241m=\u001B[39m \u001B[38;5;167;01mConnectionError\u001B[39;00m(\u001B[38;5;28mstr\u001B[39m(e), errors\u001B[38;5;241m=\u001B[39m(e,))\n\u001B[0;32m 195\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_log_request(\n\u001B[0;32m 196\u001B[0m method\u001B[38;5;241m=\u001B[39mmethod,\n\u001B[0;32m 197\u001B[0m target\u001B[38;5;241m=\u001B[39mtarget,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 200\u001B[0m exception\u001B[38;5;241m=\u001B[39merr,\n\u001B[0;32m 201\u001B[0m )\n\u001B[1;32m--> 202\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m err \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m 204\u001B[0m meta \u001B[38;5;241m=\u001B[39m ApiResponseMeta(\n\u001B[0;32m 205\u001B[0m node\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mconfig,\n\u001B[0;32m 206\u001B[0m duration\u001B[38;5;241m=\u001B[39mduration,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 209\u001B[0m headers\u001B[38;5;241m=\u001B[39mresponse_headers,\n\u001B[0;32m 210\u001B[0m )\n\u001B[0;32m 211\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_log_request(\n\u001B[0;32m 212\u001B[0m method\u001B[38;5;241m=\u001B[39mmethod,\n\u001B[0;32m 213\u001B[0m target\u001B[38;5;241m=\u001B[39mtarget,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 217\u001B[0m response\u001B[38;5;241m=\u001B[39mdata,\n\u001B[0;32m 218\u001B[0m )\n", 185 | "\u001B[1;31mConnectionTimeout\u001B[0m: Connection timed out" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "from pathlib import Path\n", 191 | "import os\n", 192 | "import json\n", 193 | "from tqdm import tqdm\n", 194 | "\n", 195 | "source_directory = Path('C:/Users/linus/big_data/pubmed/first100JSONLembedded/')\n", 196 | "\n", 197 | "index_name = \"pubmed_emb_index\"\n", 198 | "\n", 199 | "def bulk_index_documents(source_directory, index_name):\n", 200 | " if not source_directory.exists():\n", 201 | " print(\"The source directory does not exist.\")\n", 202 | " return\n", 203 | "\n", 204 | " actions = [] \n", 205 | "\n", 206 | " for file_name in tqdm(os.listdir(source_directory)):\n", 207 | " if file_name.endswith('.jsonl'):\n", 208 | " source_file = source_directory / file_name\n", 209 | " \n", 210 | " with open(source_file, 'r') as json_file:\n", 211 | " for line in json_file:\n", 212 | " try:\n", 213 | " doc = json.loads(line)\n", 214 | " action = {\n", 215 | " \"_index\": index_name,\n", 216 | " \"_source\": doc\n", 217 | " }\n", 218 | " 
actions.append(action)\n", 219 | "\n", 220 | " if len(actions) == 600: # Bulk 600 docs\n", 221 | " helpers.bulk(es, actions)\n", 222 | " actions = [] \n", 223 | " except json.JSONDecodeError as e:\n", 224 | " print(f\"Error decoding JSON: {e}\")\n", 225 | " except Exception as e:\n", 226 | " print(f\"An error occurred: {e}\")\n", 227 | "\n", 228 | " if actions:\n", 229 | " helpers.bulk(es, actions)\n", 230 | "\n", 231 | " print('Indexing complete')\n", 232 | "\n", 233 | "bulk_index_documents(source_directory, index_name)" 234 | ], 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "id": "7a60983235cd1e2b", 239 | "execution_count": 15 240 | }, 241 | { 242 | "cell_type": "code", 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Index contains 100 JSONL Chunks with 1795307 documents.\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "count_result = es.count(index='pubmed_emb_index')\n", 254 | "\n", 255 | "# Print the count\n", 256 | "print(f\"Index contains 100 JSONL Chunks with {count_result['count']} documents.\")" 257 | ], 258 | "metadata": { 259 | "collapsed": false, 260 | "ExecuteTime": { 261 | "end_time": "2024-04-06T21:02:07.817164Z", 262 | "start_time": "2024-04-06T21:02:07.768706Z" 263 | } 264 | }, 265 | "id": "30f73281bec8ed96", 266 | "execution_count": 9 267 | }, 268 | { 269 | "cell_type": "code", 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "Die Grösse des Indexes ist 30.2 GB.\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "response = es.indices.stats(index='pubmed_emb_index')\n", 281 | "index_size = response['_all']['total']['store']['size_in_bytes']\n", 282 | "\n", 283 | "print(f\"Die Grösse des Indexes ist {round(index_size/1000000000, 2)} GB.\")" 284 | ], 285 | "metadata": { 286 | "collapsed": false, 287 | "ExecuteTime": { 288 | "end_time": "2024-04-06T21:02:12.749395Z", 289 | "start_time": "2024-04-06T21:02:12.668611Z" 290 | } 291 | }, 292 | "id": "6980954f4300fb66", 293 | "execution_count": 10 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "source": [], 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "id": "2b4b89dea2ad760a" 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "source": [], 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "id": "74ff19354036d83e" 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "source": [], 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "id": "cdbfcb84384f18e3" 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "source": [], 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "id": "ee9caf6f0f31f667" 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "source": [], 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "id": "77a8d3794968c862" 334 | } 335 | ], 336 | "metadata": { 337 | "kernelspec": { 338 | "display_name": "Python 3", 339 | "language": "python", 340 | "name": "python3" 341 | }, 342 | "language_info": { 343 | "codemirror_mode": { 344 | "name": "ipython", 345 | "version": 2 346 | }, 347 | "file_extension": ".py", 348 | "mimetype": "text/x-python", 349 | "name": "python", 350 | "nbconvert_exporter": "python", 351 | "pygments_lexer": "ipython2", 352 | "version": "2.7.6" 353 | } 354 | }, 355 | "nbformat": 4, 356 | "nbformat_minor": 5 357 | } 358 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/elasticsearch/eval_elastic.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "initial_id", 7 | "metadata": { 8 | "ExecuteTime": { 9 | "end_time": "2024-04-11T15:24:33.998287Z", 10 | "start_time": "2024-04-11T15:24:33.902535Z" 11 | }, 12 | "collapsed": true 13 | }, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "{'name': '6c4d8312349c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'JIOcJVSbToiJAWP2y6W5jQ', 'version': {'number': '8.13.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '16cc90cd2d08a3147ce02b07e50894bc060a4cbf', 'build_date': '2024-04-05T14:45:26.420424304Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "from elasticsearch import Elasticsearch\n", 25 | "import urllib3\n", 26 | "import os\n", 27 | "\n", 28 | "\n", 29 | "elastic_password = os.getenv('ELASTIC_PASSWORD')\n", 30 | "\n", 31 | "es = Elasticsearch(\n", 32 | " ['https://localhost:9200'],\n", 33 | " basic_auth=('elastic', elastic_password),\n", 34 | " verify_certs=True,\n", 35 | " ca_certs=\"/home/rag/.crt/http_ca.crt\",\n", 36 | " request_timeout=60\n", 37 | " )\n", 38 | "\n", 39 | "index_name = \"pubmed_index\"\n", 40 | "\n", 41 | "urllib3.disable_warnings()\n", 42 | "\n", 43 | "print(es.info())" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "68205f492c85d24b", 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2024-04-11T15:27:03.643225Z", 53 | "start_time": "2024-04-11T15:27:03.607945Z" 54 | }, 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "ObjectApiResponse({'count': 10269126, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})" 62 | ] 63 | }, 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "# checking number of documents in the index\n", 71 | "es.count(index=index_name)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "cb477e6f0b7178d4", 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "source": [ 81 | "### Define query functions for BM25 and Vector Similarity Search\n", 82 | "\n", 83 | "Define a function to perform a BM25 search using the match query." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 10, 89 | "id": "f78167d428f06529", 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2024-04-11T15:27:07.474197Z", 93 | "start_time": "2024-04-11T15:27:07.468939Z" 94 | }, 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "# Define a search query\n", 100 | "def bm25_search(query: str, k: int = 5):\n", 101 | " query = {\n", 102 | " \"size\": k,\n", 103 | " \"query\": {\n", 104 | " \"match\": {\n", 105 | " \"content\": f\"{query}\"\n", 106 | " }\n", 107 | " },\n", 108 | " \"_source\": [\"PMID\", \"title\"]\n", 109 | " }\n", 110 | " # Elasticsearch uses the BM25 model by default to score document relevance\n", 111 | " return es.search(index='pubmed_index', body=query)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "d079c4d6618d3b38", 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "source": [ 121 | "Initialize the text embedder and define a function to convert a query to a vector using the bioBERT embeddings." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 14, 127 | "id": "ed4ac4f757e7680d", 128 | "metadata": { 129 | "ExecuteTime": { 130 | "end_time": "2024-04-11T15:27:23.549841Z", 131 | "start_time": "2024-04-11T15:27:14.558573Z" 132 | }, 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "from Embedding import TextEmbedder\n", 138 | "embedder = TextEmbedder()\n", 139 | "\n", 140 | "def query_to_vector(text, embedder):\n", 141 | " embedding = embedder.embed(text)\n", 142 | " return embedding" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "e6015236b12040cb", 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "source": [ 152 | "Define a function to perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents." 
153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 4, 158 | "id": "caccaaa64018caea", 159 | "metadata": { 160 | "ExecuteTime": { 161 | "end_time": "2024-04-10T11:14:40.480125Z", 162 | "start_time": "2024-04-10T11:14:40.474963Z" 163 | }, 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "def cosine_similarity(index, query: str, k: int = 5):\n", 169 | " query_vector = query_to_vector(query, embedder)\n", 170 | " \n", 171 | " query = {\n", 172 | " \"size\": k, # number of results to return\n", 173 | " \"query\": {\n", 174 | " \"script_score\": {\n", 175 | " \"query\": {\"match_all\": {}},\n", 176 | " \"script\": {\n", 177 | " \"source\": \"cosineSimilarity(params.query_vector, 'embeddings') + 1.0\",\n", 178 | " # +1.0 to ensure that all values are positive\n", 179 | " \"params\": {\"query_vector\": query_vector}\n", 180 | " }\n", 181 | " }\n", 182 | " }\n", 183 | " }\n", 184 | " return es.search(index=index, body=query)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "raw", 189 | "id": "6101062777191904", 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "source": [ 194 | "Trying to use the kNN search instead of the cosine similarity search." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 5, 200 | "id": "89ecbbd9451faf5f", 201 | "metadata": { 202 | "ExecuteTime": { 203 | "end_time": "2024-04-10T11:14:41.556397Z", 204 | "start_time": "2024-04-10T11:14:41.551909Z" 205 | }, 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "def knn_search(index, query: str, k: int = 10):\n", 211 | " # Convert the query into a vector\n", 212 | " query_vector = query_to_vector(query, embedder)\n", 213 | " \n", 214 | " # Build the k-NN search request\n", 215 | " knn_query = { \n", 216 | " \"knn\": {\n", 217 | " \"field\": \"embeddings\", # the field that contains the vectors\n", 218 | " \"query_vector\": query_vector,\n", 219 | " \"k\": k,\n", 220 | " \"num_candidates\": 100 \n", 221 | " }\n", 222 | " }\n", 223 | " \n", 224 | " # Run the k-NN search\n", 225 | " return es.search(index=index, body=knn_query)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "8548993de949a242", 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "source": [ 235 | "#### Perform BM25 and Vector Similarity Searches\n", 236 | "\n", 237 | "First, perform a BM25 search using the match query." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 14, 243 | "id": "4cc8b147ef147f06", 244 | "metadata": { 245 | "ExecuteTime": { 246 | "end_time": "2024-04-11T15:29:51.585418Z", 247 | "start_time": "2024-04-11T15:29:48.190681Z" 248 | }, 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "results = bm25_search(\"What is the mortality rate of COVID-19?\", k=100)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 15, 259 | "id": "c2c54153b7684d93", 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2024-04-11T15:29:54.532943Z", 263 | "start_time": "2024-04-11T15:29:54.527831Z" 264 | }, 265 | "collapsed": false 266 | }, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "Score: 26.324913, PMID: 32621066, Title: Analysis of Austrian COVID-19 deaths by age and sex.\n", 273 | "Score: 26.013273, PMID: 34783897, Title: Preterm birth, stillbirth and early neonatal mortality during the Danish COVID-19 lockdown.\n", 274 | "Score: 25.476734, PMID: 33776407, Title: Multiple sclerosis patients and COVID-19.\n", 275 | "Score: 25.37893, PMID: 35813262, Title: Measuring the Effect of COVID-19 Pandemic on Mortality: Review and Prospect - China, 2021.\n", 276 | "Score: 25.233519, PMID: 32865940, Title: COVID-19: Why Has the Mortality Rate Declined?\n", 277 | "Score: 24.946014, PMID: 32365212, Title: [Covid-19 - deaths and analysis].\n", 278 | "Score: 24.925226, PMID: 33218796, Title: Spatial inequalities of COVID-19 mortality rate in relation to socioeconomic and environmental factors across England.\n", 279 | "Score: 24.884295, PMID: 35482643, Title: COVID-19 mortality in the United States: It's been two Americas from the start.\n", 280 | "Score: 24.837067, PMID: 32678061, Title: Men and COVID-19: A Biopsychosocial Approach to Understanding Sex Differences in Mortality and Recommendations for Practice and Policy Interventions.\n", 281 | "Score: 24.779966, PMID: 36343310, Title: Racial And Ethnic Inequalities In COVID-19 Mortality Within Carceral Settings: An Analysis Of Texas Prisons.\n", 282 | "Score: 24.666018, PMID: 32504106, Title: [Increased risk of deep vein thrombosis in intensive care unit patients with CoViD-19 infections?-Preliminary data].\n", 283 | "Score: 24.40749, PMID: 36199983, Title: Does 'Data fudging' explain the autocratic advantage? 
Evidence from the gap between Official Covid-19 mortality and excess mortality.\n", 284 | "Score: 24.375858, PMID: 35077101, Title: [Association between Covid-19 mortality and atmospheric pollution in Mexican cities].\n", 285 | "Score: 24.349539, PMID: 35058532, Title: State social distancing restrictions and nursing home outcomes.\n", 286 | "Score: 24.306416, PMID: 32501382, Title: What orthopedic surgeons need to know about Covid-19 pandemic.\n", 287 | "Score: 24.187761, PMID: 33075535, Title: Africa's low COVID-19 mortality rate: A paradox?\n", 288 | "Score: 24.10879, PMID: 36060856, Title: Excess deaths during the COVID-19 pandemic in Alberta, Canada.\n", 289 | "Score: 24.030546, PMID: 34581999, Title: Changes in COVID-19-Associated Deaths During a Year Among Blacks and Hispanics Compared to Whites in the State of Connecticut.\n", 290 | "Score: 24.01647, PMID: 36142027, Title: COVID-19, Non-Communicable Diseases, and Behavioral Factors in the Peruvian Population ≥ 15 Years: An Ecological Study during the First and Second Year of the COVID-19 Pandemic.\n", 291 | "Score: 23.977262, PMID: 33728066, Title: Effect of COVID-19 on Mortality of Pregnant and Postpartum Women: A Systematic Review and Meta-Analysis.\n", 292 | "Score: 23.973434, PMID: 32709854, Title: Covid-19 mortality is negatively associated with test number and government effectiveness.\n", 293 | "Score: 23.84487, PMID: 32661937, Title: Pediatrician, watch out for corona-phobia.\n", 294 | "Score: 23.754244, PMID: 34218318, Title: COVID-19 in pediatric patients undergoing chronic dialysis and kidney transplantation.\n", 295 | "Score: 23.751411, PMID: 32756513, Title: COVID-19 Global Risk: Expectation vs. Reality.\n", 296 | "Score: 23.73906, PMID: 32992105, Title: Preparing for and responding to Covid-19's 'second hit'.\n", 297 | "Score: 23.738796, PMID: 34048201, Title: Impact of COVID-19 Pandemic in a Brazilian High-Volume Aortic Center.\n", 298 | "Score: 23.663988, PMID: 34400452, Title: Temporal trends of COVID-19 mortality and hospitalisation rates: an observational cohort study from the US Department of Veterans Affairs.\n", 299 | "Score: 23.659967, PMID: 35217534, Title: Contribution of the elevated thrombosis risk of males to the excess male mortality observed in COVID-19: an observational study.\n", 300 | "Score: 23.652725, PMID: 35930243, Title: The role of kidney injury biomarkers in COVID-19.\n", 301 | "Score: 23.644642, PMID: 35330387, Title: Characteristics of the Third COVID-19 Pandemic Wave with Special Focus on Socioeconomic Inequalities in Morbidity, Mortality and the Uptake of COVID-19 Vaccination in Hungary.\n", 302 | "Score: 23.502783, PMID: 34630651, Title: Extracorporeal membrane oxygenation in the management of critically ill patients with coronavirus disease 2019: A narrative review.\n", 303 | "Score: 23.470608, PMID: 32343650, Title: Excess Mortality Estimation During the COVID-19 Pandemic: Preliminary Data from Portugal.\n", 304 | "Score: 23.391472, PMID: 33292536, Title: All-cause mortality supports the COVID-19 mortality in Belgium and comparison with major fatal events of the last century.\n", 305 | "Score: 23.388765, PMID: 34308302, Title: Estimating the early impact of vaccination against COVID-19 on deaths among elderly people in Brazil: Analyses of routinely-collected data on vaccine coverage and mortality.\n", 306 | "Score: 23.369747, PMID: 34354682, Title: Causes and Consequences of COVID-19-Associated Bacterial Infections.\n", 307 | "Score: 23.368668, PMID: 36093278, Title: The sources of the 
Kuznets relationship between the COVID-19 mortality rate and economic performance.\n", 308 | "Score: 23.34332, PMID: 34534226, Title: COVID-19 mortality with regard to healthcare services availability, health risks, and socio-spatial factors at department level in France: A spatial cross-sectional analysis.\n", 309 | "Score: 23.311697, PMID: 35898347, Title: Geographic and Temporal Patterns in Covid-19 Mortality by Race and Ethnicity in the United States from March 2020 to February 2022.\n", 310 | "Score: 23.279709, PMID: 32831521, Title: Putative Natural History of CoViD-19.\n", 311 | "Score: 23.26163, PMID: 34240648, Title: The one-sided explanations of a multifactorial coronavirus disease.\n", 312 | "Score: 23.255978, PMID: 33860908, Title: How often and to what extent do admitted COVID-19 patients have signs of cardiac injury?\n", 313 | "Score: 23.243877, PMID: 32776139, Title: Osteopathic Considerations for the Pregnant Patient with COVID-19.\n", 314 | "Score: 23.24017, PMID: 32945643, Title: Sex hormones and COVID-19: tussle between the two.\n", 315 | "Score: 23.193726, PMID: 32982349, Title: COVID's Razor: RAS Imbalance, the Common Denominator Across Disparate, Unexpected Aspects of COVID-19.\n", 316 | "Score: 23.180195, PMID: 35737205, Title: Sex differences in the mortality rate for coronavirus disease 2019 compared to other causes of death: an analysis of population-wide data from 63 countries.\n", 317 | "Score: 23.161182, PMID: 32953124, Title: Socio-economic inequality in global incidence and mortality rates from coronavirus disease 2019: an ecological study.\n", 318 | "Score: 23.136011, PMID: 34318585, Title: One-year mortality and consequences of COVID-19 in cancer patients: A cohort study.\n", 319 | "Score: 23.110867, PMID: 34960692, Title: The Impact of COVID-19 on Mortality in Spain: Monitoring Excess Mortality (MoMo) and the Surveillance of Confirmed COVID-19 Deaths.\n", 320 | "Score: 23.061127, PMID: 34402152, Title: A global country-level analysis of the relationship between obesity and COVID-19 cases and mortality.\n", 321 | "Score: 23.05785, PMID: 35951587, Title: Association between median household income, state Medicaid expansion status, and COVID-19 outcomes across US counties.\n", 322 | "Score: 23.050909, PMID: 34154865, Title: Perioperative mortality and morbidity of hip fractures among COVID-19 infected and non-infected patients: A systematic review and meta-analysis.\n", 323 | "Score: 23.036907, PMID: 34772396, Title: Effect of altitude on COVID-19 mortality in Ecuador: an ecological study.\n", 324 | "Score: 22.996613, PMID: 35946619, Title: Social and territorial inequalities in the mortality of children and adolescents due to COVID-19 in Brazil.\n", 325 | "Score: 22.993288, PMID: 35908851, Title: Marburg virus disease: A deadly rare virus is coming.\n", 326 | "Score: 22.960596, PMID: 33350316, Title: The prevalence, mortality, and associated risk factors for developing COVID-19 in hip fracture patients: a systematic review and meta-analysis.\n", 327 | "Score: 22.942175, PMID: 34612774, Title: Mortality in hospitalized COVID-19 patients was associated with the COVID-19 admission rate during the first year of the pandemic in Sweden.\n", 328 | "Score: 22.86676, PMID: 36319938, Title: Centenarians born before 1919 are resistant to COVID-19.\n", 329 | "Score: 22.863209, PMID: 33163879, Title: A comparative analysis of the COVID-19 pandemic response: The case of Turkey.\n", 330 | "Score: 22.852364, PMID: 32941862, Title: Critically Ill Adults With Coronavirus 
Disease 2019 in New Orleans and Care With an Evidence-Based Protocol.\n", 331 | "Score: 22.83515, PMID: 32947506, Title: Systematic review and meta-analysis of the effectiveness and safety of hydroxychloroquine in treating COVID-19 patients.\n", 332 | "Score: 22.809055, PMID: 33690595, Title: International heterogeneity in coronavirus disease 2019 pediatric mortality rates.\n", 333 | "Score: 22.803581, PMID: 33412821, Title: Socioeconomic inequalities in overall and COVID-19 mortality during the first outbreak peak in Emilia-Romagna Region (Northern Italy).\n", 334 | "Score: 22.79761, PMID: 33591211, Title: The rate of COVID-19 and associated mortality after elective hip and knee arthroplasty prior to cessation of elective services in UK.\n", 335 | "Score: 22.794813, PMID: 33655277, Title: Sex differences in the mortality rate for coronavirus disease 2019 compared to other causes of death.\n", 336 | "Score: 22.784885, PMID: 34143810, Title: Association of the past epidemic of Mycobacterium tuberculosis with mortality and incidence of COVID-19.\n", 337 | "Score: 22.782915, PMID: 35811045, Title: Trends in Etiology-based Mortality From Chronic Liver Disease Before and During COVID-19 Pandemic in the United States.\n", 338 | "Score: 22.77812, PMID: 34236244, Title: Clinical characteristics, risk factors, and cardiac manifestations of cancer patients with COVID-19.\n", 339 | "Score: 22.765633, PMID: 33495884, Title: Worldwide inverse correlation between Bacille Calmette-Guérin (BCG) immunization and COVID-19 mortality.\n", 340 | "Score: 22.764214, PMID: 34857490, Title: Impact of diabetes mellitus on COVID-19 clinical symptoms and mortality: Jakarta's COVID-19 epidemiological registry.\n", 341 | "Score: 22.725569, PMID: 35757461, Title: Associations between nighttime light and COVID-19 incidence and mortality in the United States.\n", 342 | "Score: 22.719309, PMID: 33818679, Title: Sex Disparities in COVID-19 Mortality Vary Across US Racial Groups.\n", 343 | "Score: 22.710667, PMID: 33830986, Title: COVID-19 Incidence and Mortality Among American Indian/Alaska Native and White Persons - Montana, March 13-November 30, 2020.\n", 344 | "Score: 22.710562, PMID: 34698315, Title: Association between Obesity and COVID-19 Mortality in Peru: An Ecological Study.\n", 345 | "Score: 22.701323, PMID: 35360752, Title: HELLP Syndrome and COVID-19; association or accident: A case series.\n", 346 | "Score: 22.699673, PMID: 34164954, Title: Incidence and Mortality Associated with Cardiovascular Medication among Hypertensive COVID-19 Patients in South Korea.\n", 347 | "Score: 22.697275, PMID: 36189099, Title: Low Mortality of Orthopedic Trauma Patients With Asymptomatic COVID-19: A Level I Trauma Center Pandemic Experience.\n", 348 | "Score: 22.682953, PMID: 32980614, Title: A systematic review of COVID-19 and obstructive sleep apnoea.\n", 349 | "Score: 22.655434, PMID: 36125437, Title: Comparison of all renal replacement therapy modalities in terms of COVID-19 infection rate & mortality in the COVID-19 pandemic and importance of home therapies.\n", 350 | "Score: 22.59405, PMID: 32865700, Title: Respiratory characteristics and related intraoperative ventilatory management for patients with COVID-19 pneumonia.\n", 351 | "Score: 22.58143, PMID: 33519136, Title: Hospital transmission rates of the SARS-CoV 2 disease amongst orthopaedic in-patients in a secondary care centre: A quantitative review.\n", 352 | "Score: 22.571785, PMID: 35411615, Title: Outcomes of Minority COVID-19 patients managed with ECMO: A 
single-center experience.\n", 353 | "Score: 22.562658, PMID: 33024235, Title: The age distribution of mortality from novel coronavirus disease (COVID-19) suggests no large difference of susceptibility by age.\n", 354 | "Score: 22.5612, PMID: 34884277, Title: The Impact of COVID-19 Pandemic on Management and Outcome in Patients with Heart Failure.\n", 355 | "Score: 22.557558, PMID: 33657587, Title: Lessons from COVID-19 mortality data across countries.\n", 356 | "Score: 22.556175, PMID: 32292261, Title: Biological and epidemiological trends in the prevalence and mortality due to outbreaks of novel coronavirus COVID-19.\n", 357 | "Score: 22.549688, PMID: 34780361, Title: Association between obesity and diabetes prevalence and COVID-19 mortality in Mexico: an ecological study.\n", 358 | "Score: 22.54769, PMID: 35422037, Title: Rapidly improving acute respiratory distress syndrome in COVID-19: a multi-centre observational study.\n", 359 | "Score: 22.540619, PMID: 33831280, Title: Distribution of COVID-19 cases and deaths in Europe during the first 12 peak weeks of outbreak.\n", 360 | "Score: 22.53608, PMID: 34816925, Title: The prognostic significance of erythrocyte sedimentation rate in COVID-19.\n", 361 | "Score: 22.532238, PMID: 35530744, Title: Incidence and predictors of mortality among COVID-19 patients admitted to treatment centers in North West Ethiopia; A retrospective cohort study, 2021.\n", 362 | "Score: 22.523144, PMID: 35014703, Title: COVID-19 infection and its consequences among surgical oncology patients: A systematic analysis, meta-analysis and meta-regression.\n", 363 | "Score: 22.519073, PMID: 33288965, Title: [Public healthcare expenditure and COVID-19 mortality in Spain and in Europe].\n", 364 | "Score: 22.519073, PMID: 34049840, Title: Public healthcare expenditure and COVID-19 mortality in Spain and in Europe.\n", 365 | "Score: 22.511044, PMID: 33556327, Title: Factors associated with the spatial heterogeneity of the first wave of COVID-19 in France: a nationwide geo-epidemiological study.\n", 366 | "Score: 22.510841, PMID: 34027674, Title: Mortality risk of surgically managing orthopaedic trauma during the COVID-19 pandemic.\n", 367 | "Score: 22.508713, PMID: 35340979, Title: The world trade network: country centrality and the COVID-19 pandemic.\n", 368 | "Score: 22.507309, PMID: 34488764, Title: Impact of long-term exposure to PM2.5 and temperature on coronavirus disease mortality: observed trends in France.\n", 369 | "Score: 22.496813, PMID: 35260481, Title: Risk and protective factors for severe COVID-19 infection in a cohort of patients with sickle cell disease.\n", 370 | "Score: 22.496813, PMID: 35700866, Title: Geographical distribution of cystic fibrosis carriers as population genetic determinant of COVID-19 spread and fatality in 37 countries.\n", 371 | "Score: 22.489035, PMID: 33879694, Title: The therapeutic effect and safety of the drugs for COVID-19: A systematic review and meta-analysis.\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "# Print the results\n", 377 | "for hit in results['hits']['hits']:\n", 378 | " print(f\"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}\")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "a381d7040c0a1d1c", 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "source": [ 388 | "#### Vector Similarity Search\n", 389 | "now, perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 30, 395 | "id": "c9b5f988888de099", 396 | "metadata": { 397 | "ExecuteTime": { 398 | "end_time": "2024-04-06T21:20:25.617032Z", 399 | "start_time": "2024-04-06T21:19:31.899009Z" 400 | }, 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Score: 1.9210962, PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.\n", 409 | "Score: 1.9196633, PMID: 2042633, Title: The Egr family of nuclear signal transducers.\n", 410 | "Score: 1.9190896, PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.\n", 411 | "Score: 1.9177192, PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.\n", 412 | "Score: 1.9172626, PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.\n", 413 | "Score: 1.9162145, PMID: 1622545, Title: The regulation and function of p21ras in T cells.\n", 414 | "Score: 1.9161748, PMID: 1329870, Title: The junction between cytokines and cell adhesion.\n", 415 | "Score: 1.9161192, PMID: 1645965, Title: Overexpression of human TRK proto-oncogene into mouse cells using an inducible vector system.\n", 416 | "Score: 1.9159867, PMID: 1675819, Title: The expanding family of guanylyl cyclases.\n", 417 | "Score: 1.9159176, PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.\n" 418 | ] 419 | } 420 | ], 421 | "source": [ 422 | "# Run the query\n", 423 | "results = cosine_similarity(index_name, \"List signaling molecules (ligands) that interact with the receptor EGFR?\", k=10)\n", 424 | "\n", 425 | "for hit in results['hits']['hits']:\n", 426 | " print(f\"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}\")" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "4cea44d3127ce96", 432 | "metadata": { 433 | "collapsed": false 434 | }, 435 | "source": [ 436 | "#### k-NN Search\n", 437 | "Perform a k-NN search using the k-NN search API." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 45, 443 | "id": "bfcd1d07b4fee44c", 444 | "metadata": { 445 | "ExecuteTime": { 446 | "end_time": "2024-04-06T21:35:04.707056Z", 447 | "start_time": "2024-04-06T21:34:40.128519Z" 448 | }, 449 | "collapsed": false 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "Score: 0.96054816, PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.\n", 457 | "Score: 0.9598316, PMID: 2042633, Title: The Egr family of nuclear signal transducers.\n", 458 | "Score: 0.9595448, PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.\n", 459 | "Score: 0.9588597, PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.\n", 460 | "Score: 0.9586312, PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.\n", 461 | "Score: 0.95810723, PMID: 1622545, Title: The regulation and function of p21ras in T cells.\n", 462 | "Score: 0.9580873, PMID: 1329870, Title: The junction between cytokines and cell adhesion.\n", 463 | "Score: 0.9579934, PMID: 1675819, Title: The expanding family of guanylyl cyclases.\n", 464 | "Score: 0.9579588, PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.\n", 465 | "Score: 0.9578283, PMID: 2103500, Title: Cellular and viral ligands that interact with the EGF receptor.\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "results = knn_search(index_name, \"List signaling molecules (ligands) that interact with the receptor EGFR?\", k=10)\n", 471 | "\n", 472 | "for hit in results['hits']['hits']:\n", 473 | " print(f\"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}\")" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "id": "1846b6e75b5da818", 479 | "metadata": { 480 | "collapsed": false 481 | }, 482 | "source": [ 483 | "### ELSER - Elastic Learned Sparse EncodeR\n", 484 | "\n", 485 | "ELSER is a sparse vector representation for semantic retrieval developed by Elastic. Instead of dense vector representations, ELSER uses sparse vectors to represent text data. " 486 | ] 487 | } 488 | ], 489 | "metadata": { 490 | "kernelspec": { 491 | "display_name": "Python 3", 492 | "language": "python", 493 | "name": "python3" 494 | }, 495 | "language_info": { 496 | "codemirror_mode": { 497 | "name": "ipython", 498 | "version": 3 499 | }, 500 | "file_extension": ".py", 501 | "mimetype": "text/x-python", 502 | "name": "python", 503 | "nbconvert_exporter": "python", 504 | "pygments_lexer": "ipython3", 505 | "version": "3.10.12" 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 5 510 | } 511 | --------------------------------------------------------------------------------