├── information_retrieval ├── elastic_container │ ├── errors.jsonl │ ├── start_elasticsearch.sh │ ├── ingest_data.py │ └── elastic.ipynb ├── faiss_container │ ├── docker-compose.yml │ ├── Dockerfile │ ├── server.py │ └── faiss_insert_data.ipynb └── document_encoding │ ├── bioBERT_encoder.py │ ├── medCPT_encoder.py │ └── encode_documents.ipynb ├── sys_requirements.txt ├── LICENSE ├── evaluation ├── evaluation_data_storages │ ├── faiss │ │ ├── conncatinatior.py │ │ ├── embedding_extractor.py │ │ └── request.ipynb │ ├── documentation.md │ ├── mongodb │ │ └── eval_mongo.ipynb │ └── elasticsearch │ │ ├── elastic.ipynb │ │ └── eval_elastic.ipynb └── evaluation_QA_system │ ├── full_text_evaluation.py │ ├── evaluation_pipeline.ipynb │ └── RAG_evaluator.py ├── rag_system ├── bm25_retriever.py ├── bioBERT_encoder.py ├── med_rag.py ├── hybrid_retriever.py ├── bioBERT_retriever.py ├── medCPT_encoder.py ├── openAI_chat.py └── medCPT_retriever.py ├── requirements.txt ├── .gitignore └── README.md /information_retrieval/elastic_container/errors.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /information_retrieval/faiss_container/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | faiss-app: 3 | image: continuumio/anaconda3 4 | ports: 5 | - "5000:5000" 6 | volumes: 7 | - ./server.py:/app/server.py 8 | - ./faiss_indices:/app/faiss_indices 9 | - ./PMIDs:/app/PMIDs 10 | working_dir: /app 11 | environment: 12 | - FLASK_APP=server.py 13 | - FLASK_RUN_HOST=0.0.0.0 14 | command: > 15 | /bin/bash -c "conda install -c pytorch faiss-cpu -y && 16 | pip install flask numpy pandas && 17 | python server.py" -------------------------------------------------------------------------------- /sys_requirements.txt: -------------------------------------------------------------------------------- 1 | # Essential system packages for Ubuntu 22.04.4 LTS (Jammy) 2 | apt # Advanced Package Tool, a package management system for Debian 3 | curl # Command line tool for transferring data with URLs 4 | gcc # GNU Compiler Collection, a compiler system 5 | g++ # GNU C++ Compiler 6 | make # Utility for directing compilation 7 | python3 # Python programming language interpreter 8 | python3-pip # Package installer for Python 9 | git # Version control system 10 | docker-ce # Docker: the open-source application container engine 11 | docker-compose-plugin # Docker Compose (V2) plugin for the Docker CLI 12 | build-essential # Informational list of build-essential packages 13 | -------------------------------------------------------------------------------- /information_retrieval/faiss_container/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the Anaconda base image 2 | FROM continuumio/anaconda3 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Install necessary packages using Conda and Pip 8 | RUN conda install -c pytorch faiss-cpu -y && \ 9 | pip install flask numpy pandas 10 | 11 | # Copy the application files into the container 12 | COPY ./server.py /app/server.py 13 | COPY ./faiss_indices /app/faiss_indices 14 | COPY ./PMIDs /app/PMIDs 15 | 16 | # Set environment variables 17 | ENV FLASK_APP=/app/server.py 18 | ENV FLASK_RUN_HOST=0.0.0.0 19 | ENV FLASK_RUN_PORT=5000 20 | 21 | # Expose port 5000 for communication with the Flask app 22 | EXPOSE 5000 23 | 24 | # Define 
the command that runs when the container starts 25 | CMD ["flask", "run"] 26 | 27 | # docker run -d --name faiss_cpt -p 5000:5000 faiss:latest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Linus Stuhlmann & Michael Saxer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/faiss/conncatinatior.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | import pandas as pd 5 | 6 | def concatenate_pubmed_ids(input_dir: Path, output_dir: Path) -> None: 7 | # Sicherstellen, dass das Ausgabeverzeichnis vorhanden ist 8 | output_dir.mkdir(exist_ok=True) 9 | 10 | # Laden und Konkatenieren der PubMed ID Arrays 11 | final_pubmed_ids = np.array([], dtype=int) 12 | 13 | id_files = sorted(input_dir.glob('pubmed_ids_*.npy')) 14 | 15 | if not id_files: 16 | print("Keine PubMed ID .npy Dateien gefunden.") 17 | return 18 | 19 | for file in tqdm(id_files, desc="Lade und konkatiniere PubMed IDs"): 20 | ids = np.load(file) 21 | final_pubmed_ids = np.concatenate((final_pubmed_ids, ids)) 22 | 23 | # Speichern der finalen PubMed IDs 24 | pd.DataFrame(final_pubmed_ids).to_csv(output_dir / 'concatenated_pubmed_ids.csv', index=False, header=False) 25 | print("Finale PubMed IDs gespeichert.") 26 | 27 | if __name__ == "__main__": 28 | input_dir = Path('/home/ubuntu/data/numpy_embeddings') 29 | output_dir = Path('/home/ubuntu/stuhllin/medical_RAG_system/information_retrieval/faiss_container/PMIDs') 30 | concatenate_pubmed_ids(input_dir, output_dir) 31 | -------------------------------------------------------------------------------- /information_retrieval/document_encoding/bioBERT_encoder.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, models 2 | import torch 3 | 4 | class bioBERTEncoder: 5 | def __init__(self, max_length=512): 6 | if torch.cuda.is_available(): 7 | self.device = "cuda" 8 | else: 9 | self.device = "cpu" 10 | 11 | self.max_length = max_length 12 | 13 | word_embedding_model = models.Transformer('dmis-lab/biobert-v1.1', max_seq_length=self.max_length) 14 | pooling_model = 
models.Pooling(word_embedding_model.get_word_embedding_dimension(), 15 | pooling_mode_mean_tokens=True, 16 | pooling_mode_cls_token=False, 17 | pooling_mode_max_tokens=False) 18 | 19 | self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device) 20 | 21 | def __call__(self, batch): 22 | contents = [item["content"] for item in batch] 23 | embeddings = self.model.encode(contents, batch_size=len(contents), show_progress_bar=False) 24 | return [{"id": item["id"], "title": item["title"], "content": item["content"], "PMID": item.get("PMID", None), "embeddings": embedding.tolist()} for item, embedding in zip(batch, embeddings)] 25 | -------------------------------------------------------------------------------- /information_retrieval/elastic_container/start_elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check whether the Docker network exists 4 | if ! docker network ls | grep -qw elastic; then 5 | echo "Network 'elastic' does not exist. Creating it..." 6 | docker network create elastic 7 | else 8 | echo "Network 'elastic' already exists." 9 | fi 10 | 11 | # Check whether the Docker volume exists 12 | if ! docker volume ls | grep -qw elasticsearch_data; then 13 | echo "Volume 'elasticsearch_data' does not exist. Creating it..." 14 | docker volume create elasticsearch_data 15 | else 16 | echo "Volume 'elasticsearch_data' already exists." 17 | fi 18 | 19 | docker pull docker.elastic.co/elasticsearch/elasticsearch:8.13.4 20 | 21 | # Start the Elasticsearch container 22 | echo "Starting Elasticsearch container..." 23 | docker run \ 24 | --name es01 \ 25 | --net elastic \ 26 | -p 9200:9200 \ 27 | -it \ 28 | -m 32GB \ 29 | --volume elasticsearch_data:/usr/share/elasticsearch/data \ 30 | -e "ES_JAVA_OPTS=-Xms16g -Xmx16g" \ 31 | docker.elastic.co/elasticsearch/elasticsearch:8.13.4 32 | 33 | # Set a 16 GB heap (Xms and Xmx) to avoid OutOfMemoryError 34 | 35 | echo "Elasticsearch container started."
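# Note: the Python clients in this repository (ingest_data.py and the rag_system retrievers) verify TLS
# against a local copy of the cluster's CA certificate (http_ca.crt) and read the password from
# ELASTIC_PASSWORD. Assuming the default layout of the official Elasticsearch 8.x image, both can be
# obtained roughly as follows (illustrative sketch; adjust the target path to wherever ca_certs points):
#
#   docker cp es01:/usr/share/elasticsearch/config/certs/http_ca.crt ~/.crt/http_ca.crt
#   docker exec -it es01 /usr/share/elasticsearch/bin/elasticsearch-reset-password -u elastic
#   export ELASTIC_PASSWORD='<password printed by the command above>'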
36 | 37 | 38 | # if crt problem, use this command to start the container 39 | # docker run --name es01 --net elastic -p 9200:9200 -it -m 32GB -e "ES_JAVA_OPTS=-Xms16g -Xmx16g" docker.elastic.co/elasticsearch/elasticsearch:8.13.4 -------------------------------------------------------------------------------- /information_retrieval/faiss_container/server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | import faiss 3 | import numpy as np 4 | import pandas as pd 5 | 6 | app = Flask(__name__) 7 | 8 | # Load the Faiss index (assuming the index has already been created and saved) 9 | index_path = "/app/faiss_indices/bioBERT_index.index" 10 | index = faiss.read_index(index_path) 11 | 12 | # Load PMIDs and their respective index numbers 13 | pmids_path = "/app/PMIDs/bioBERT_pmids.csv" 14 | pmids_df = pd.read_csv(pmids_path) 15 | 16 | # Create a dictionary to map index numbers to PMIDs 17 | index_to_pmids = dict(zip(pmids_df['Index'], pmids_df['PMID'])) 18 | 19 | @app.route('/search', methods=['POST']) 20 | def search(): 21 | # Extract the query vectors and the value of k from the POST request 22 | data = request.get_json() 23 | queries = np.array(data['queries'], dtype='float32') 24 | 25 | # Get the number of nearest neighbors to search for 26 | k = int(data['k']) 27 | 28 | # Perform the search in the Faiss index 29 | distances, indices = index.search(queries, k) 30 | 31 | # Map the Faiss indices to PMIDs using the dictionary 32 | matched_PMIDs = [[index_to_pmids[idx] for idx in row] for row in indices] 33 | 34 | # Return the response as JSON 35 | return jsonify(PMIDs=matched_PMIDs, distances=distances.tolist()) 36 | 37 | if __name__ == '__main__': 38 | app.run(host='0.0.0.0', port=5000) # Accessible over port 5000 on all network interfaces -------------------------------------------------------------------------------- /rag_system/bm25_retriever.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import json 4 | 5 | class BM25Retriever: 6 | def __init__(self): 7 | elastic_password = os.getenv('ELASTIC_PASSWORD') 8 | self.es = Elasticsearch( 9 | ['https://localhost:9200'], 10 | basic_auth=('elastic', elastic_password), 11 | verify_certs=True, 12 | ca_certs="/home/rag/.crt/http_ca.crt", 13 | request_timeout=60 14 | ) 15 | self.index = "pubmed_index" 16 | 17 | def retrieve_docs(self, query: str, k: int = 10): 18 | es_query = { 19 | "size": k, 20 | "query": { 21 | "match": { 22 | "content": query 23 | } 24 | }, 25 | "_source": ["PMID", "title", "content"] 26 | } 27 | # Execute the search query 28 | response = self.es.search(index=self.index, body=es_query) 29 | 30 | # Format the results into the desired JSON structure 31 | results = {} 32 | for idx, doc in enumerate(response['hits']['hits'], 1): 33 | doc_key = f"doc{idx}" 34 | results[doc_key] = { 35 | 'PMID': doc['_source']['PMID'], 36 | 'title': doc['_source']['title'], 37 | 'content': doc['_source']['content'], 38 | 'score': doc['_score'] 39 | } 40 | 41 | return json.dumps(results, indent=4) -------------------------------------------------------------------------------- /rag_system/bioBERT_encoder.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, models 2 | import torch 3 | import time 4 | 5 | 6 | class BioBERTQueryEncooder: 7 | def __init__(self, 
model_name='dmis-lab/biobert-v1.1', max_length=512): 8 | if torch.cuda.is_available(): 9 | self.device = "cuda" 10 | else: 11 | self.device = "cpu" 12 | 13 | self.max_length = max_length 14 | 15 | # Load the pretrained BioBERT model and add a mean-pooling layer 16 | word_embedding_model = models.Transformer(model_name, max_seq_length=self.max_length) 17 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 18 | pooling_mode_mean_tokens=True, 19 | pooling_mode_cls_token=False, 20 | pooling_mode_max_tokens=False) 21 | 22 | self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device) 23 | 24 | def encode(self, text): 25 | # Transform the text into its vector representation 26 | embedding = self.model.encode([text], batch_size=1, show_progress_bar=False) 27 | return embedding[0] 28 | 29 | if __name__ == "__main__": 30 | 31 | embedder = BioBERTQueryEncooder() 32 | text = "This is a test sentence." 33 | start = time.time() 34 | embedding = embedder.encode(text) 35 | print(embedding) 36 | print(time.time() - start) -------------------------------------------------------------------------------- /information_retrieval/document_encoding/medCPT_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModel 3 | from typing import List, Dict 4 | 5 | class medCPTArticleEncoder: 6 | def __init__(self, max_length=512): 7 | if torch.cuda.is_available(): 8 | self.device = "cuda" 9 | else: 10 | self.device = "cpu" 11 | 12 | self.max_length = max_length 13 | self.model = AutoModel.from_pretrained("ncbi/MedCPT-Article-Encoder") 14 | self.tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Article-Encoder") 15 | 16 | def __call__(self, batch: List[Dict]) -> List[Dict]: 17 | encoded_articles = [] 18 | 19 | with torch.no_grad(): 20 | # Extract the article content from the batch 21 | articles = [item["content"] for item in batch] 22 | 23 | # Tokenize the articles 24 | encoded = self.tokenizer( 25 | articles, 26 | truncation=True, 27 | padding=True, 28 | return_tensors='pt', 29 | max_length=self.max_length, 30 | ) 31 | 32 | # Encode the articles (use the [CLS] token as the representation) 33 | outputs = self.model(**encoded) 34 | embeddings = outputs.last_hidden_state[:, 0, :] 35 | 36 | for i, item in enumerate(batch): 37 | encoded_articles.append({ 38 | "id": item["id"], 39 | "title": item["title"], 40 | "content": item["content"], 41 | "PMID": item.get("PMID", None), 42 | "embedding": embeddings[i].tolist() 43 | }) 44 | 45 | return encoded_articles 46 | -------------------------------------------------------------------------------- /rag_system/med_rag.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from openAI_chat import Chat 4 | from bioBERT_retriever import BioBERTRetriever 5 | from bm25_retriever import BM25Retriever 6 | from hybrid_retriever import HybridRetriever 7 | from medCPT_retriever import MedCPTRetriever 8 | 9 | class MedRAG: 10 | def __init__(self, retriever=1, question_type=1, n_docs=10): 11 | if retriever == 1: 12 | self.retriever = BioBERTRetriever() 13 | elif retriever == 2: 14 | self.retriever = BM25Retriever() 15 | elif retriever == 3: 16 | self.retriever = HybridRetriever() 17 | elif retriever == 4: 18 | self.retriever = MedCPTRetriever(rerank=True) 19 | else: 20 | raise ValueError("Invalid retriever value. Choose 1 for bioBERT, 2 for BM25, 3 for hybrid, or 4 for MedCPT.")
21 | 22 | self.chat = Chat(question_type=question_type) # 1 for full text, 2 for yes/no, 3 for multiple choice, 4 for keywords 23 | self.n_docs = n_docs 24 | 25 | def extract_pmids(self, docs): 26 | # Extracts PMIDs from the documents and returns them as a list 27 | return [doc["PMID"] for doc in docs.values()] 28 | 29 | def get_answer(self, question: str) -> str: 30 | 31 | # retrieve the documents, timing the retrieval 32 | start_time_retrieval = time.time() 33 | retrieved_docs = json.loads(self.retriever.retrieve_docs(question, self.n_docs)) 34 | end_time_retrieval = time.time() 35 | 36 | # extract the PMIDs from the retrieved documents 37 | pmids = self.extract_pmids(retrieved_docs) 38 | 39 | # the chat response is a json string {'response': '...', 'used_PMIDs': [...]}; timing the generation 40 | start_time_generation = time.time() 41 | answer = self.chat.create_chat(question, retrieved_docs) 42 | end_time_generation = time.time() 43 | 44 | retrieval_time = end_time_retrieval - start_time_retrieval 45 | generation_time = end_time_generation - start_time_generation 46 | 47 | # now add the retrieved PMIDs and timings to the response 48 | try: 49 | answer = json.loads(answer) 50 | answer['retrieved_PMIDs'] = pmids 51 | answer['retrieval_time'] = retrieval_time 52 | answer['generation_time'] = generation_time 53 | except: 54 | return None 55 | return json.dumps(answer) -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/faiss/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import gc # import the garbage collector 4 | from pathlib import Path 5 | from tqdm import tqdm 6 | 7 | def process_files(files): 8 | pubmed_ids = [] 9 | embeddings = [] 10 | for file_name in tqdm(files, desc="Processing files", leave=False): 11 | with open(file_name, 'r') as file: 12 | for line in file: 13 | try: 14 | data = json.loads(line) 15 | pubmed_ids.append(int(data.get('PMID', 0))) # convert to integer, defaulting to 0 16 | embeddings.append(data.get('embeddings', [])) # default to an empty list 17 | except json.JSONDecodeError as e: 18 | print(f"Error decoding JSON in file {file_name}: {e}") 19 | return pubmed_ids, embeddings 20 | 21 | source_directory = Path('/home/ubuntu/pubmed') 22 | jsonl_files = list(source_directory.glob('*.jsonl')) 23 | batch_size = 15 # maximum number of files per batch 24 | 25 | # Paths for intermediate files 26 | temp_dir = Path('/home/ubuntu/temp_pubmed') 27 | temp_dir.mkdir(exist_ok=True) # make sure the directory exists 28 | 29 | # Process the files in batches and save intermediate results 30 | for i in tqdm(range(0, len(jsonl_files), batch_size), desc="Processing batches"): 31 | batch_files = jsonl_files[i:i + batch_size] 32 | batch_pubmed_ids, batch_embeddings = process_files(batch_files) 33 | # Save the batch data to temporary files 34 | np.save(temp_dir / f'embeddings_{i // batch_size}.npy', batch_embeddings) 35 | np.save(temp_dir / f'pubmed_ids_{i // batch_size}.npy', batch_pubmed_ids) 36 | # Delete the lists to free memory 37 | del batch_pubmed_ids, batch_embeddings 38 | gc.collect() # explicitly trigger garbage collection 39 | 40 | # Load all intermediate files and concatenate the arrays 41 | final_embeddings = np.concatenate([np.load(file) for file in temp_dir.glob('embeddings_*.npy')]) 42 | final_pubmed_ids = np.concatenate([np.load(file) for file in temp_dir.glob('pubmed_ids_*.npy')])
43 | 44 | # Save the final arrays 45 | np.save('embeddings.npy', final_embeddings) 46 | np.save('pubmed_ids.npy', final_pubmed_ids) 47 | 48 | # Clean up: delete the temporary files 49 | for file in temp_dir.glob('*.npy'): 50 | file.unlink() 51 | temp_dir.rmdir() # remove the directory if it is empty 52 | -------------------------------------------------------------------------------- /rag_system/hybrid_retriever.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import json 4 | from medCPT_encoder import MedCPTCrossEncoder 5 | 6 | class HybridRetriever: 7 | def __init__(self): 8 | elastic_password = os.getenv('ELASTIC_PASSWORD') 9 | self.es = Elasticsearch( 10 | ['https://localhost:9200'], 11 | basic_auth=('elastic', elastic_password), 12 | verify_certs=True, 13 | ca_certs="/home/rag/.crt/http_ca.crt", 14 | request_timeout=60 15 | ) 16 | self.index = "pubmed_index" 17 | self.reranker = MedCPTCrossEncoder() 18 | 19 | def rerank_docs(self, query: str, docs: list): 20 | """Reranks the documents based on their relevance to the query.""" 21 | scores = self.reranker.score([doc['content'] for doc in docs], query) 22 | reranked_docs = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True) 23 | return reranked_docs 24 | 25 | def retrieve_docs(self, query: str, top_n: int = 10, k: int = 20): 26 | """Retrieves documents from Elasticsearch and reranks them, returning only the top N results.""" 27 | es_query = { 28 | "size": k, 29 | "query": { 30 | "match": { 31 | "content": query 32 | } 33 | }, 34 | "_source": ["PMID", "title", "content"] 35 | } 36 | # Execute the search query 37 | response = self.es.search(index=self.index, body=es_query) 38 | 39 | # Extract documents with full metadata 40 | docs = [{ 41 | 'PMID': hit['_source']['PMID'], 42 | 'title': hit['_source']['title'], 43 | 'content': hit['_source']['content'] 44 | } for hit in response['hits']['hits']] 45 | 46 | # Rerank the documents 47 | reranked_docs = self.rerank_docs(query, docs) 48 | 49 | # only take documents with a score > 0 50 | reranked_docs = [doc for doc in reranked_docs if doc[1] > 0] 51 | 52 | # Take at most the top N reranked documents 53 | top_reranked_docs = reranked_docs[:top_n] 54 | 55 | # Construct the final results with reranked scores 56 | results = { 57 | f"doc{idx + 1}": { 58 | 'PMID': doc['PMID'], 59 | 'title': doc['title'], 60 | 'content': doc['content'], 61 | 'score': score.item() 62 | } 63 | for idx, (doc, score) in enumerate(top_reranked_docs) 64 | } 65 | 66 | return json.dumps(results, indent=4) 67 | 68 | if __name__ == "__main__": 69 | retriever = HybridRetriever() 70 | query = "Is Alzheimer's disease hereditary?"
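# k controls how many BM25 candidates are pulled from Elasticsearch before cross-encoder
# reranking; top_n caps how many reranked documents are returned to the caller.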
71 | results = retriever.retrieve_docs(query, k=100, top_n=10) 72 | print(results) 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anaconda==0.0.1.1 # Anaconda package 2 | annotated-types==0.6.0 # Support for typing-annotations 3 | anyio==4.3.0 # Async network and file operations 4 | argon2-cffi==23.1.0 # The secure Argon2 password hashing algorithm 5 | attrs==23.2.0 # Attributes without boilerplate 6 | Babel==2.14.0 # Internationalization utilities 7 | beautifulsoup4==4.12.3 # Screen-scraping library 8 | bleach==6.1.0 # Sanitize your inputs 9 | click==8.1.7 # Command Line Interface Creation Kit 10 | decorator==5.1.1 # Simplifies the usage of decorators 11 | elastic-transport==8.13.0 # Transport layer for Elasticsearch 12 | elasticsearch==8.13.0 # Official Elasticsearch client 13 | faiss-cpu==1.8.0 # A library for efficient similarity search and clustering 14 | Flask==3.0.3 # Micro web framework 15 | fsspec==2024.3.1 # File system specification 16 | huggingface-hub==0.22.2 # Client library for Huggingface hub 17 | idna==3.3 # Internationalized Domain Names in Applications (IDNA) 18 | importlib-metadata==4.6.4 # Library to access the metadata for a Python package 19 | joblib==1.4.0 # Lightweight pipelining in Python 20 | jsonschema==4.21.1 # JSON Schema validation for Python 21 | jupyter==1.0.0 # Jupyter metapackage 22 | jupyterlab==4.1.8 # JupyterLab: the next generation Jupyter notebook 23 | MarkupSafe==2.1.5 # Implements a XML/HTML/XHTML Markup safe string for Python 24 | matplotlib==3.8.4 # Plotting library for Python 25 | more-itertools==8.10.0 # More routines for operating on iterables, beyond itertools 26 | nbconvert==7.16.3 # Convert Jupyter Notebooks 27 | numpy==1.26.4 # Fundamental package for array computing 28 | openai==1.23.3 # OpenAI API client 29 | packaging==24.0 # Core utilities for Python packages 30 | pandas==2.2.2 # Data analysis and manipulation library 31 | prompt-toolkit==3.0.43 # Library for building powerful interactive command lines 32 | psutil==5.9.8 # Cross-platform process and system utilities 33 | pydantic==2.7.1 # Data validation and settings management using Python type annotations 34 | Pygments==2.17.2 # Syntax highlighting package 35 | regex==2024.4.16 # Alternative regular expression module 36 | requests==2.31.0 # Simple HTTP library for Python 37 | scikit-learn==1.4.2 # Machine learning library 38 | scipy==1.13.0 # Fundamental library for scientific computing 39 | sentence-transformers==2.7.0 # BERT and SentenceTransformers library 40 | six==1.16.0 # Python 2 and 3 compatibility utilities 41 | tqdm==4.66.2 # Fast, extensible progress bar for Python 42 | traitlets==5.14.2 # Configuration system for Python applications 43 | transformers==4.40.0 # State-of-the-art Natural Language Processing for TensorFlow and PyTorch 44 | torch==2.2.2 # Tensors and Dynamic neural networks in Python with strong GPU acceleration 45 | urllib3==1.26.5 # HTTP library with thread-safe connection pooling 46 | Werkzeug==3.0.2 # Comprehensive WSGI web application library 47 | -------------------------------------------------------------------------------- /rag_system/bioBERT_retriever.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import requests 4 | import json 5 | from bioBERT_encoder import BioBERTQueryEncooder 6 | 7 | class BioBERTRetriever: 
8 | def __init__(self): 9 | elastic_password = os.getenv('ELASTIC_PASSWORD') 10 | self.es = Elasticsearch( 11 | ['https://localhost:9200'], 12 | basic_auth=('elastic', elastic_password), 13 | verify_certs=True, 14 | ca_certs="/home/ubuntu/.crts/http_ca.crt", 15 | request_timeout=60 16 | ) 17 | self.index = "pubmed_index" 18 | self.faiss_url = "http://localhost:5000/search" 19 | self.query_encoder = BioBERTQueryEncooder() 20 | 21 | def query_to_vector(self, text: str): 22 | """Converts text query to a vector using the BioBERT encoder.""" 23 | embedding = self.query_encoder.encode(text) 24 | return embedding 25 | 26 | def faiss_query(self, query: str, k: int = 10): 27 | """Performs a vector search using FAISS with the given query and k.""" 28 | vec = self.query_to_vector(query).tolist() # Convert numpy array to list 29 | data = { 30 | 'queries': [vec], # List of vectors 31 | 'k': k 32 | } 33 | response = requests.post(self.faiss_url, headers={'Content-Type': 'application/json'}, data=json.dumps(data)) 34 | return response.json() 35 | 36 | def get_docs_via_PMIDs(self, PMIDs: list): 37 | """Retrieves documents from Elasticsearch using a list of PMIDs.""" 38 | query = { 39 | "size": len(PMIDs), 40 | "query": { 41 | "terms": { 42 | "PMID": PMIDs 43 | } 44 | }, 45 | "_source": ["PMID", "title", "content"] 46 | } 47 | return self.es.search(index=self.index, body=query) 48 | 49 | def retrieve_docs(self, query: str, k: int = 10): 50 | """Retrieves documents relevant to the query using both FAISS and Elasticsearch.""" 51 | response = self.faiss_query(query, k) 52 | PMIDs = response['PMIDs'][0] # Assumes PMIDs are returned in a structured list 53 | es_response = self.get_docs_via_PMIDs(PMIDs) 54 | results = {} 55 | 56 | # Formatting the response as required 57 | for idx, hit in enumerate(es_response['hits']['hits'], 1): 58 | doc_key = f"doc{idx}" 59 | results[doc_key] = { 60 | 'PMID': hit['_source']['PMID'], 61 | 'title': hit['_source']['title'], 62 | 'content': hit['_source']['content'] 63 | } 64 | 65 | return json.dumps(results, indent=4) 66 | 67 | 68 | if __name__ == '__main__': 69 | retriever = BioBERTRetriever() 70 | query = "What is the role of sdRNA in cancer?" 
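# This demo assumes the FAISS Flask service from information_retrieval/faiss_container is
# reachable at http://localhost:5000/search and that the Elasticsearch index is running.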
71 | n_docs = 5 72 | response = retriever.retrieve_docs(query, n_docs) 73 | print(response) -------------------------------------------------------------------------------- /rag_system/medCPT_encoder.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification 2 | import torch 3 | 4 | 5 | class MedCPTQueryEncoder: 6 | def __init__(self, model_name='ncbi/MedCPT-Query-Encoder', max_length=512): 7 | if torch.cuda.is_available(): 8 | self.device = "cuda" 9 | else: 10 | self.device = "cpu" 11 | 12 | self.max_length = max_length 13 | 14 | # Load pretrained MedCPT-Query-Encoder model and tokenizer 15 | self.model = AutoModel.from_pretrained(model_name).to(self.device) 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 17 | 18 | def encode(self, text): 19 | with torch.no_grad(): 20 | # Tokenize the text 21 | inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_length).to(self.device) 22 | # Pass the inputs through the model 23 | outputs = self.model(**inputs) 24 | # Return the last hidden states 25 | return outputs.last_hidden_state[:, 0, :] 26 | 27 | 28 | class MedCPTCrossEncoder: 29 | def __init__(self, model_name='ncbi/MedCPT-Cross-Encoder'): 30 | if torch.cuda.is_available(): 31 | self.device = "cuda" 32 | else: 33 | self.device = "cpu" 34 | 35 | # Load pretrained Cross-Encoder model and tokenizer 36 | self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device) 37 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 38 | 39 | def score(self, articles, query): 40 | pairs = [[query, article] for article in articles] 41 | 42 | with torch.no_grad(): 43 | encoded = self.tokenizer( 44 | pairs, 45 | truncation=True, 46 | padding=True, 47 | return_tensors="pt", 48 | max_length=512, 49 | ).to(self.device) 50 | 51 | logits = self.model(**encoded).logits.squeeze(dim=1) 52 | return logits 53 | 54 | 55 | if __name__ == "__main__": 56 | 57 | cross_encoder = MedCPTCrossEncoder() 58 | 59 | query = "What is the treatment for diabetes?" 60 | 61 | articles = [ 62 | "Diabetes is a chronic disease that occurs when the body is unable to produce enough insulin or use it effectively. Treatment for diabetes includes lifestyle changes, such as diet and exercise, as well as medications like insulin and oral hypoglycemic drugs.", 63 | "The treatment for diabetes involves managing blood sugar levels through diet, exercise, and medication. Insulin therapy, oral medications, and lifestyle changes are common approaches to managing diabetes.", 64 | "Diabetes treatment typically involves a combination of diet, exercise, and medication. 
Insulin therapy, oral medications, and lifestyle changes are key components of managing diabetes.", 65 | ] 66 | 67 | scores = cross_encoder.score(articles, query) 68 | 69 | for i, (article, score) in enumerate(zip(articles, scores)): 70 | print(f"Article {i+1}: {article}") 71 | print(f"Score: {score:.4f}\n") 72 | -------------------------------------------------------------------------------- /evaluation/evaluation_QA_system/full_text_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | 4 | 5 | class evaluateResponseGPT: 6 | def __init__(self, response, answer): 7 | self.response = response 8 | self.correct_answer = answer 9 | self.model = "gpt-3.5-turbo" 10 | api_key = os.getenv('OPENAI_API_KEY') 11 | self.client = openai.OpenAI(api_key=api_key) 12 | self.context = self.set_context() 13 | 14 | def set_context(self) -> str: 15 | return ( 16 | "You will evaluate a response by comparing it to an expert's optimal answer in the biomedical domain. " 17 | "The evaluation process should include the following steps: " 18 | "1. Identification of key terms and concepts in both the provided response and the expert's optimal answer. " 19 | "2. Assessment of the context of the used terms and concepts in the response and the expert's answer. " 20 | "3. Determination of the accuracy and completeness of the provided response. " 21 | "Score the response on a scale from 0 to 10, where 0 means no overlap at all with the expert's answer and 10 means a perfect match. " 22 | "Provide the discrete numerical score as your response." 23 | ) 24 | 25 | def set_initial_message(self): 26 | return [{"role": "system", "content": self.context}] 27 | 28 | def get_evaluation(self) -> float: 29 | messages = self.set_initial_message() 30 | messages.append({"role": "user", "content": f"Response: {self.response}"}) 31 | print(f"Response: {self.response}") 32 | messages.append({"role": "user", "content": f"Correct answer: {self.correct_answer}. Please score the response above from 0 to 10 based on its accuracy and completeness."}) 33 | print(f"Correct answer: {self.correct_answer}") 34 | try: 35 | completion = self.client.chat.completions.create( 36 | model=self.model, 37 | messages=messages, 38 | max_tokens=500, 39 | temperature=0.0 40 | ) 41 | 42 | # Correct way to access the message content 43 | response_content = completion.choices[0].message.content # Removed incorrect dictionary access 44 | try: 45 | score = float(response_content.strip()) 46 | except ValueError: 47 | score = 0 # Handle the case where the response cannot be converted to float 48 | except Exception as e: 49 | print(f"An error occurred during response evaluation: {e}") 50 | score = 0 51 | 52 | return score/10 # Normalize the score to be between 0 and 1 53 | 54 | 55 | if __name__ == "__main__": 56 | response = "Standard treatment for type 2 diabetes is insulin injections and does not emphasize lifestyle changes or oral medications like metformin." 57 | correct_answer = "The standard treatment for type 2 diabetes involves lifestyle modifications such as diet and exercise, complemented by medications like metformin to regulate blood sugar levels."
58 | evaluator = evaluateResponseGPT(response, correct_answer) 59 | score = evaluator.get_evaluation() 60 | print(f"Response score: {score}") -------------------------------------------------------------------------------- /rag_system/openAI_chat.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | import json 4 | from typing import List, Dict 5 | 6 | class Chat: 7 | def __init__(self, question_type: int = 1, api_key: str = os.getenv('OPENAI_API_KEY'), model: str = "gpt-3.5-turbo"): 8 | self.api_key = api_key 9 | self.model = model 10 | self.client = openai.OpenAI(api_key=self.api_key) 11 | self.context = self.set_context(question_type) 12 | 13 | def set_context(self, question_type: int) -> str: 14 | base_context = ( 15 | "You are a scientific medical assistant designed to synthesize responses " 16 | "from specific medical documents. Only use the information provided in the " 17 | "documents to answer questions. The first documents should be the most relevant. " 18 | "Do not use any other information except for the documents provided. " 19 | "When answering questions, always format your response " 20 | "as a JSON object with the fields 'response' and 'used_PMIDs'. " 21 | "Cite all PMIDs your response is based on in the 'used_PMIDs' field. " 22 | "Please think step-by-step before answering questions and provide the most accurate response possible." 23 | ) 24 | 25 | question_specific_context = { 26 | 1: " Provide a detailed answer to the question in the 'response' field.", 27 | 2: " Your response should only be 'yes' or 'no'. If no relevant documents are found, return 'no_docs_found'.", 28 | 3: " Choose between the given options 1 to 4 and return as 'response' the chosen number. If no relevant documents are found, return the number 5.", 29 | 4: " Respond with keywords and list each keyword separately as a list element. For example ['keyword1', 'keyword2', 'keyword3']. 
If no relevant documents are found, return an empty list.", 30 | } 31 | 32 | return base_context + question_specific_context.get(question_type, "") 33 | 34 | def set_initial_message(self) -> List[dict]: 35 | return [{"role": "system", "content": self.context}] 36 | 37 | def create_chat(self, user_message: str, retrieved_documents: Dict) -> str: 38 | messages = self.set_initial_message() 39 | messages.append({"role": "user", "content": f"Answer the following question: {user_message}"}) 40 | 41 | document_texts = ["PMID {}: {} {}".format(doc['PMID'], doc['title'], doc['content']) for doc in retrieved_documents.values()] 42 | documents_message = "\n\n".join(document_texts) # Separating documents with two newlines 43 | messages.append({"role": "system", "content": documents_message}) 44 | 45 | try: 46 | completion = self.client.chat.completions.create( 47 | model=self.model, 48 | messages=messages, 49 | max_tokens=500, 50 | temperature=0.0 51 | ) 52 | 53 | response_content = completion.choices[0].message.content 54 | try: 55 | response_data = json.loads(response_content) 56 | formatted_response = { 57 | "response": response_data.get("response"), 58 | "used_PMIDs": response_data.get("used_PMIDs", []), 59 | "retrieved_PMIDs": [doc['PMID'] for doc in retrieved_documents.values()] 60 | } 61 | return json.dumps(formatted_response) 62 | except json.JSONDecodeError: 63 | return json.dumps({"error": "Invalid JSON format in response.", "response": response_content}) 64 | 65 | except Exception as e: 66 | return json.dumps({"error": str(e)}) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | data/ 163 | 164 | information_retrieval/faiss_container/faiss_indices 165 | 166 | information_retrieval/faiss_container/PMIDs -------------------------------------------------------------------------------- /information_retrieval/elastic_container/ingest_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import os 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | from elasticsearch import Elasticsearch, helpers 8 | 9 | password = os.getenv("ELASTIC_PASSWORD") 10 | 11 | es = Elasticsearch( 12 | hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}], 13 | ca_certs="/home/rag/.crt/http_ca.crt", 14 | basic_auth=("elastic", password), 15 | ) 16 | 17 | # Define the index name 18 | index_name = "pubmed_index" 19 | 20 | # Delete the index if it exists 21 | if es.indices.exists(index=index_name): 22 | es.indices.delete(index=index_name) 23 | 24 | # Check again if the index exists, and if not, create it 25 | if not es.indices.exists(index=index_name): 26 | # Define the mapping 27 | mapping = { 28 | "settings": { 29 | "analysis": { 30 | "analyzer": { 31 | "default": { 32 | "type": "standard", 33 | "stopwords": "_english_" 34 | } 35 | } 36 | } 37 | }, 38 | "mappings": { 39 | "properties": { 40 | "content": { 41 | "type": "text", 42 | "analyzer": "default", 43 | "fields": { 44 | "keyword": { 45 | "type": "keyword", 46 | "ignore_above": 256 47 | } 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | 55 | # Create the index with the defined mapping 56 | es.indices.create(index=index_name, body=mapping) 57 | 58 | source_directory = Path('/home/rag/data/chunk') 59 | error_log_path = Path('./errors.jsonl') # Pfad zur Fehlerprotokolldatei 60 | 61 | def bulk_index_documents(source_directory, index_name, error_log_path): 62 | if not source_directory.exists(): 63 | print("The source directory does not exist.") 64 | return 65 | 66 | actions = [] # List to store the documents to be indexed 67 | 68 | # Open the error log file for writing 69 | with error_log_path.open('w') as error_log: 70 | # Iterate through each file in the source directory 71 | for file_name in tqdm(list(os.listdir(source_directory))): 72 | if file_name.endswith('.jsonl'): 73 | source_file = source_directory / file_name 74 | 75 | # Open and read the JSONL file 76 | with open(source_file, 'r') as json_file: 77 | for line in json_file: 78 | try: 79 | doc = json.loads(line) 80 | 81 | action = { 82 | "_index": index_name, 83 | "_source": doc 84 | } 85 | actions.append(action) 86 | 87 | if len(actions) == 200: # Bulk indexing threshold 88 | helpers.bulk(es, actions) 89 | actions = [] 90 | except json.JSONDecodeError as e: 91 | # Log the error 92 | error_log.write(f"Error in file {file_name}: {e}\n") 93 | error_log.write(f"{line}\n") 94 | except Exception as e: 95 | error_log.write(f"Unexpected error in file {file_name}: {e}\n") 96 | error_log.write(f"{line}\n") 97 | 98 | # Index any remaining documents 99 | if actions: 100 | helpers.bulk(es, actions) 101 | 102 | print('Indexing complete') 103 | 104 | # Call the function to index the documents 105 | bulk_index_documents(source_directory, index_name, error_log_path) 106 | 107 | # Count and print the number of documents in the index 108 | count_result = es.count(index=index_name) 109 | print(f"Index contains {count_result['count']} documents.") 110 | 111 | # to run this script in the background, use the following command: 112 | # nohup python3 ./ingest_data.py > output.log 2>&1 & 
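Once ingestion has finished, it is worth sanity-checking the index before wiring up the retrievers. The sketch below is illustrative rather than part of the repository; it assumes the same `ELASTIC_PASSWORD` environment variable, CA-certificate path, and `pubmed_index` name used by ingest_data.py, and simply counts the indexed documents and runs one BM25 match query.

```python
import os
from elasticsearch import Elasticsearch

# Connection settings mirror ingest_data.py; adjust the host and certificate path to your setup.
es = Elasticsearch(
    hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}],
    ca_certs="/home/rag/.crt/http_ca.crt",
    basic_auth=("elastic", os.getenv("ELASTIC_PASSWORD")),
)

index_name = "pubmed_index"

# 1. How many documents made it into the index?
print("Documents indexed:", es.count(index=index_name)["count"])

# 2. Run a single BM25 full-text query and print the top hits.
query = {
    "size": 3,
    "query": {"match": {"content": "insulin resistance"}},
    "_source": ["PMID", "title"],
}
for hit in es.search(index=index_name, body=query)["hits"]["hits"]:
    print(hit["_score"], hit["_source"].get("PMID"), hit["_source"].get("title"))
```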
-------------------------------------------------------------------------------- /rag_system/medCPT_retriever.py: -------------------------------------------------------------------------------- 1 | from medCPT_encoder import MedCPTQueryEncoder, MedCPTCrossEncoder 2 | from elasticsearch import Elasticsearch 3 | import os 4 | import requests 5 | import json 6 | 7 | 8 | class MedCPTRetriever: 9 | def __init__(self, rerank=True): 10 | elastic_password = os.getenv('ELASTIC_PASSWORD') 11 | self.es = Elasticsearch( 12 | ['https://localhost:9200'], 13 | basic_auth=('elastic', elastic_password), 14 | verify_certs=True, 15 | ca_certs="/home/ubuntu/.crts/http_ca.crt", 16 | request_timeout=60 17 | ) 18 | self.index = "pubmed_index" 19 | self.faiss_url = "http://localhost:5000/search" 20 | self.text_encoder = MedCPTQueryEncoder() 21 | self.reranker = MedCPTCrossEncoder() 22 | self.rerank_enabled = rerank 23 | 24 | def query_to_vector(self, text: str): 25 | """Converts text query to a vector using the medCPT query encoder.""" 26 | embedding = self.text_encoder.encode(text) 27 | return embedding[0] 28 | 29 | def faiss_request(self, query: str, k: int = 100): 30 | """Performs a vector search using FAISS with the given query and k.""" 31 | vec = self.query_to_vector(query).tolist() # Convert numpy array to list 32 | data = { 33 | 'queries': [vec], # List of vectors 34 | 'k': k 35 | } 36 | response = requests.post(self.faiss_url, headers={'Content-Type': 'application/json'}, data=json.dumps(data)) 37 | return response.json() 38 | 39 | def get_docs_via_PMIDs(self, PMIDs: list): 40 | """Retrieves documents from Elasticsearch using a list of PMIDs.""" 41 | query = { 42 | "size": len(PMIDs), 43 | "query": { 44 | "terms": { 45 | "PMID": PMIDs 46 | } 47 | }, 48 | "_source": ["PMID", "title", "content"] 49 | } 50 | return self.es.search(index=self.index, body=query) 51 | 52 | def rerank_docs(self, query: str, docs: list, top_n: int): 53 | """Reranks the documents based on their relevance to the query and returns the top N.""" 54 | scores = self.reranker.score([doc['content'] for doc in docs], query) 55 | reranked_docs = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)[:top_n] 56 | return reranked_docs 57 | 58 | def retrieve_docs(self, query: str, k: int = 20, top_n: int = 10): 59 | """Retrieves documents relevant to the query using both FAISS and Elasticsearch.""" 60 | response = self.faiss_request(query, k) 61 | PMIDs = response['PMIDs'][0] 62 | es_response = self.get_docs_via_PMIDs(PMIDs) 63 | 64 | docs = [{ 65 | 'PMID': hit['_source']['PMID'], 66 | 'title': hit['_source']['title'], 67 | 'content': hit['_source']['content'] 68 | } for hit in es_response['hits']['hits']] 69 | 70 | # Apply reranking if enabled 71 | if self.rerank_enabled: 72 | reranked_docs = self.rerank_docs(query, docs, top_n) 73 | 74 | # only take documents with a score > 0 75 | reranked_docs = [doc for doc in reranked_docs if doc[1] > 0] 76 | 77 | results = { 78 | f"doc{idx + 1}": { 79 | 'PMID': doc['PMID'], 80 | 'title': doc['title'], 81 | 'content': doc['content'], 82 | 'score': score.item() 83 | } 84 | for idx, (doc, score) in enumerate(reranked_docs) 85 | } 86 | else: 87 | results = { 88 | f"doc{idx + 1}": { 89 | 'PMID': doc['PMID'], 90 | 'title': doc['title'], 91 | 'content': doc['content'] 92 | } 93 | for idx, doc in enumerate(docs[:top_n]) 94 | } 95 | 96 | return json.dumps(results, indent=4) 97 | 98 | 99 | if __name__ == "__main__": 100 | retriever = MedCPTRetriever() 101 | query = "What is the treatment for diabetes?" 
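# With rerank=True (the default) the k FAISS nearest neighbours are re-scored by the MedCPT
# cross-encoder and only the top_n best-scoring documents are returned; with rerank=False the
# first top_n documents fetched from Elasticsearch are returned without scores.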
102 | print(retriever.retrieve_docs(query, k=20, top_n=3)) 103 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/faiss/request.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Status Code: 200\n", 13 | "Response: {'distances': [[345.34063720703125, 347.5325622558594, 347.6310729980469, 347.86968994140625, 348.7896728515625, 348.9269714355469, 349.13153076171875, 349.4542236328125, 349.48699951171875, 349.59759521484375, 349.71319580078125, 349.86529541015625, 350.11236572265625, 350.1585693359375, 350.3203125, 350.3330993652344, 350.364990234375, 350.3964538574219, 350.4454040527344, 350.55328369140625, 350.5574951171875, 350.5615234375, 350.697265625, 350.7253723144531, 350.7264404296875, 350.75177001953125, 350.86883544921875, 350.92620849609375, 350.9451904296875, 350.97515869140625, 351.14190673828125, 351.14959716796875, 351.150634765625, 351.163330078125, 351.16705322265625, 351.2235412597656, 351.3066711425781, 351.31182861328125, 351.3298034667969, 351.3414306640625, 351.3535461425781, 351.3605041503906, 351.3978576660156, 351.4039306640625, 351.42755126953125, 351.4990234375, 351.50018310546875, 351.5561828613281, 351.5712890625, 351.578369140625, 351.599365234375, 351.6015625, 351.6611328125, 351.699462890625, 351.7027587890625, 351.70855712890625, 351.7459716796875, 351.76617431640625, 351.775634765625, 351.7792663574219, 351.77984619140625, 351.797607421875, 351.80450439453125, 351.8053283691406, 351.808349609375, 351.8154296875, 351.8173828125, 351.84246826171875, 351.8436584472656, 351.85662841796875, 351.86383056640625, 351.869873046875, 351.8704833984375, 351.87548828125, 351.88934326171875, 351.89654541015625, 351.8977355957031, 351.900634765625, 351.9150390625, 351.933837890625, 351.94317626953125, 351.9481201171875, 351.9498291015625, 351.9609375, 351.97540283203125, 351.98065185546875, 351.9837646484375, 351.9930725097656, 352.00732421875, 352.0120849609375, 352.0337829589844, 352.0369873046875, 352.04071044921875, 352.04254150390625, 352.048095703125, 352.0513916015625, 352.05279541015625, 352.06463623046875, 352.0819091796875, 352.0927734375]], 'indices': [[1133332, 1129233, 2158547, 670332, 1559199, 1707872, 1346023, 1932016, 1302318, 1893635, 1375642, 2191381, 2179104, 2222540, 1133682, 1433335, 184772, 1298703, 1044265, 670344, 2402989, 334779, 2019346, 658058, 1398487, 1777282, 273849, 1384217, 436939, 2265182, 1616784, 2098363, 966665, 473742, 261485, 17393, 1722879, 1125923, 2132707, 1106967, 1910545, 82362, 552523, 958848, 2023649, 594676, 1319413, 2277406, 1895067, 1923708, 1715754, 374483, 190973, 1894858, 382862, 1881046, 401552, 1923465, 620328, 368685, 1215806, 1507128, 1765301, 88256, 527378, 1328518, 924844, 2294050, 79309, 1816391, 1538691, 975819, 2263032, 1210736, 943233, 1616936, 1426340, 1337171, 528403, 2033495, 1199468, 523944, 822048, 1138545, 1677746, 2225820, 309120, 1190258, 192989, 2212557, 886555, 225812, 1244613, 1911463, 2119927, 2344697, 1931231, 462343, 469270, 1742767]]}\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import requests\n", 19 | "import numpy as np\n", 20 | "import json\n", 21 | "\n", 22 | "# URL des Flask-Endpoints\n", 23 | "url = 'http://localhost:5000/search'\n", 24 | "\n", 25 | "# Generiere einen zufälligen Vektor\n", 26 
| "random_vector = np.random.rand(768).tolist()\n", 27 | "\n", 28 | "k = 100\n", 29 | "\n", 30 | "# Daten für die POST-Anfrage\n", 31 | "data = {\n", 32 | " 'queries': [random_vector],\n", 33 | " 'k': k\n", 34 | "}\n", 35 | "\n", 36 | "# Senden der POST-Anfrage\n", 37 | "response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json.dumps(data))\n", 38 | "\n", 39 | "# Ausgabe der Antwort\n", 40 | "print('Status Code:', response.status_code)\n", 41 | "print('Response:', response.json())" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.10.12" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 2 66 | } 67 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/documentation.md: -------------------------------------------------------------------------------- 1 | ### Investigation of Elasticsearch and MongoDB as Data Storage 2 | 3 | #### Elasticsearch 4 | ElasticSearch is built on Java and utilizes the Lucene search engine. It writes data to inverted indexes using Lucene segments. 5 | Elasticsearch avoids excessive I/O by creating dedicated transactional index logs, preventing frequent low-level Lucene commits during indexing. 6 | 7 | #### MongoDB: 8 | MongoDB, written in C++, uses a memory map file to map on-disk data files to in-memory byte arrays. 9 | It organizes data using a doubly linked data structure. MongoDB processes shut down in case of low system memory or high resource utilization, ensuring stability 10 | 11 | #### Indexes for Full-Text Search in Elasticsearch and MongoDB. 12 | 13 | - **Elasticsearch** uses inverted indexes for full-text search. It uses the BM25 algorithm to rank documents based on relevance. 14 | - **MongoDB** uses ... 15 | 16 | #### Loading data into Elasticsearch: 17 | 1. Created an dense vector index with 768 dimensions for bioBERT embeddings. 18 | 2. Indexed the embeddings of the first 50 JSONL files in the dense vector index. 19 | 3. Indexing time took: [2:14: 00<00:00, 80.71s/it] for 1'800'000 documents. 20 | 4. 21 | 5. Bulk loading (200 docs) turned out to be significantly faster than single document loading. 22 | 23 | #### Loading data into MongoDB: 24 | 1. Created a collection with bioBERT embeddings. 25 | 2. Inserted the embeddings of the first 100 JSONL files into the collection. 26 | 3. Inserting time took: [21:06<00:00, 12.66s/it] for 1'795'879 documents. 27 | 4. Inserting data (Bulk: 1000 docs) into MongoDB was faster than Elasticsearch. 28 | 5. Text indexing for full text search took 14:30 minutes. 29 | 30 | ### Retriever Comparison 31 | 32 | - **Full text search** using BM25 ranking algorithm. 33 | - **Semantic search** using bioBERT embedding and KNN / Cosine Similarity. 34 | - **Hybrid search** previous ranking using BM25 followed by semantic search and/or DPR. 35 | 36 | ### BioLinkBERT for Information Retrieval 37 | 38 | Die Integration von BioLinkBERT oder ähnlichen Sprachmodellen in ein Information Retrieval (IR) System kann die Genauigkeit und Relevanz der Suchergebnisse erheblich verbessern, insbesondere in spezialisierten Wissensdomänen wie der Biomedizin. 
39 | 40 | ### 1. Choosing the Information Retrieval System 41 | First, a suitable IR system has to be selected or developed. This could be a traditional keyword-based search or a more advanced semantic search engine based on vector-space search (e.g. Elasticsearch or Solr with vector-search plugins). 42 | 43 | ### 2. Preparing the Index 44 | - **Document preparation**: All documents have to be indexed. This involves extracting the text, preparing it, and possibly annotating it with metadata. 45 | - **Integrating BioLinkBERT**: Use BioLinkBERT to transform texts into high-dimensional vectors that are then stored in the search index. These vectors represent the semantic signatures of the documents. 46 | 47 | ### 3. Query Processing 48 | - **Query transformation**: When a user submits a search query, the query should also be processed by BioLinkBERT to obtain its semantic representation. 49 | - **Vector search**: Use the generated vectors to compute the semantic proximity between the query and the documents in the index, for example by calculating cosine similarities between the vectors. 50 | 51 | ### 4. Ranking and Relevance Feedback 52 | - **Relevance ranking**: Documents are ranked by their semantic proximity to the query; the higher the similarity, the more relevant the document. 53 | - **Feedback loop**: Optional user feedback on the search results can be used to further train the model and improve the accuracy of the results. 54 | 55 | ### 5. Applying Advanced NLP Techniques 56 | - **Question answering**: For specific queries, especially in QA systems, BioLinkBERT can be used to extract answers directly from the texts by identifying relevant passages and surfacing the information they contain. 57 | - **Summarization and highlighting**: For longer documents, BioLinkBERT can be used to create summaries or to highlight key information relevant to the query. 58 | 59 | ### 6. Scaling and Performance Optimization 60 | - **Efficiency**: Note that processing queries with a full language model can be computationally expensive. Efficiency gains can be achieved through techniques such as quantization, pruning, or specialized hardware. 61 | - **Parallelization**: To increase throughput, queries can be parallelized and executed on several servers or in the cloud. 62 | 63 | Integrating BioLinkBERT into an IR system requires careful planning and optimization, but it can considerably improve the system's ability to find topically relevant and contextually appropriate documents.
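To make the vector-search and ranking steps described above (sections 3 and 4) concrete, the sketch below embeds a query and a few documents and ranks the documents by cosine similarity. The encoder name is only an assumption for illustration (the repository's BioBERT mean-pooling setup or any other sentence-level encoder could be substituted); this is not the retrieval code used elsewhere in the repo.

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Assumed encoder: any model that maps a text to a fixed-size vector works here.
# (sentence-transformers falls back to mean pooling for plain transformer checkpoints.)
model = SentenceTransformer("dmis-lab/biobert-v1.1")

documents = [
    "Metformin is a first-line oral medication for type 2 diabetes.",
    "Amyloid-beta plaques are a pathological hallmark of Alzheimer's disease.",
    "Insulin therapy is used when oral agents fail to control blood glucose.",
]
query = "How is type 2 diabetes treated?"

doc_vecs = model.encode(documents)    # shape: (n_docs, dim)
query_vec = model.encode([query])[0]  # shape: (dim,)

def cosine(a, b):
    # Cosine similarity: dot product divided by the product of the vector norms.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

scores = [cosine(query_vec, d) for d in doc_vecs]

# Rank documents by similarity, highest first.
for idx in np.argsort(scores)[::-1]:
    print(f"{scores[idx]:.3f}  {documents[idx]}")
```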
-------------------------------------------------------------------------------- /information_retrieval/faiss_container/faiss_insert_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Ingesting embeddings into Faiss index" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import faiss\n", 17 | "import json\n", 18 | "import numpy as np\n", 19 | "import os\n", 20 | "import csv\n", 21 | "from pathlib import Path\n", 22 | "from tqdm import tqdm" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Parsing through the JSONL files, extracting the bioBERT embeddings and the corresponding PMIDs. Creating a Faiss index with the embeddings and a CSV file storing PMIDs with the corresponding Faiss index id. " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stderr", 39 | "output_type": "stream", 40 | "text": [ 41 | "Processing JSONL files: 100%|██████████| 140/140 [41:12<00:00, 17.66s/it]\n" 42 | ] 43 | }, 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Index successfully written to: /home/ubuntu/data/faiss_indices/medCPT/medCPT_index.index\n", 49 | "CSV file successfully written to: /home/ubuntu/data/faiss_indices/medCPT/medCPT_pmids.csv\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "# Directory setup\n", 55 | "index_directory = \"/home/ubuntu/data/faiss_indices/medCPT\"\n", 56 | "index_file = \"medCPT_index.index\"\n", 57 | "index_path = os.path.join(index_directory, index_file)\n", 58 | "csv_file = \"/home/ubuntu/data/faiss_indices/medCPT/medCPT_pmids.csv\"\n", 59 | "\n", 60 | "# Ensure the index directory exists\n", 61 | "if not os.path.exists(index_directory):\n", 62 | " os.makedirs(index_directory)\n", 63 | "\n", 64 | "# Dimensions of the embeddings\n", 65 | "d = 768\n", 66 | "\n", 67 | "# Initialize the Faiss index (Flat L2-Index)\n", 68 | "index = faiss.IndexFlatL2(d)\n", 69 | "\n", 70 | "# Initialize the CSV file for PMIDs\n", 71 | "csv_path = csv_file\n", 72 | "csv_rows = []\n", 73 | "\n", 74 | "# Collecting all JSONL files in the current directory\n", 75 | "source_directory = Path('/home/ubuntu/data/pubmed_medCPT')\n", 76 | "\n", 77 | "# Retrieve and sort the files based on their numerical order in filenames\n", 78 | "sorted_files = sorted(source_directory.glob('*.jsonl'), key=lambda x: int(x.stem.split('n')[-1]))\n", 79 | "\n", 80 | "# Processing sorted files with progress display\n", 81 | "for file_name in tqdm(sorted_files, desc=\"Processing JSONL files\"):\n", 82 | " with open(file_name, 'r') as file:\n", 83 | " for line in file:\n", 84 | " try:\n", 85 | " data = json.loads(line)\n", 86 | " embeddings = data.get('embedding')\n", 87 | " pmid = int(data.get('PMID'))\n", 88 | " \n", 89 | " # If embeddings and PMID are present, add them to the index\n", 90 | " if embeddings and pmid:\n", 91 | " embeddings = np.array(embeddings, dtype='float32').reshape(1, -1) # Convert to NumPy array and reshape\n", 92 | " index.add(embeddings)\n", 93 | " \n", 94 | " # Add PMIDs, filenames, and index numbers for ordering to the CSV\n", 95 | " index_num = index.ntotal - 1 # Index number of the last added embedding\n", 96 | " csv_rows.append([pmid, file_name.name, index_num])\n", 97 | " except json.JSONDecodeError as 
e:\n", 98 | " print(f\"Error decoding JSON in file {file_name}: {e}\")\n", 99 | "\n", 100 | "# Write the index to a file\n", 101 | "faiss.write_index(index, index_path)\n", 102 | "\n", 103 | "print(f\"Index successfully written to: {index_path}\")\n", 104 | "\n", 105 | "# Write PMIDs to CSV file\n", 106 | "with open(csv_path, 'w', newline='') as csvfile:\n", 107 | " csv_writer = csv.writer(csvfile)\n", 108 | " csv_writer.writerow(['PMID', 'Filename', 'Index'])\n", 109 | " csv_writer.writerows(csv_rows)\n", 110 | "\n", 111 | "print(f\"CSV file successfully written to: {csv_path}\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "index = faiss.read_index('faiss_indices/PM_index.index')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "k = 10 # Number of nearest neighbors\n", 130 | "\n", 131 | "query = np.random.rand(768).tolist()\n", 132 | "\n", 133 | "distances, indices = index.search(query, k)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 1, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "Status Code: 200\n", 146 | "Response: {'distances': [[348.77490234375, 348.9889221191406, 349.5247497558594, 349.7203369140625, 349.90228271484375, 349.9190979003906, 350.23382568359375, 350.36578369140625, 350.47930908203125, 350.5979309082031]], 'indices': [[470115, 1932016, 473742, 469270, 1405245, 670332, 1715754, 2382674, 1707872, 2141577]]}\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "import requests\n", 152 | "import numpy as np\n", 153 | "import json\n", 154 | "\n", 155 | "# URL of the Flask endpoint\n", 156 | "url = 'http://localhost:5000/search'\n", 157 | "\n", 158 | "# Generate a random vector of length 768\n", 159 | "random_vector = np.random.rand(768).tolist() # Convert numpy array directly to list\n", 160 | "\n", 161 | "# Data for the POST request\n", 162 | "data = {\n", 163 | " 'queries': [random_vector] # Ensure this is a list of lists\n", 164 | "}\n", 165 | "\n", 166 | "# Convert data to JSON before sending as POST request\n", 167 | "json_data = json.dumps(data)\n", 168 | "\n", 169 | "# Send the POST request\n", 170 | "response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json_data)\n", 171 | "\n", 172 | "# Output the response\n", 173 | "print('Status Code:', response.status_code)\n", 174 | "print('Response:', response.json())" 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "base", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.10.12" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Medical RAG System 3 | 4 | This repository contains a comprehensive implementation of a Medical Retrieval-Augmented Generation (RAG) system. 
The system integrates multiple components for document retrieval, question answering, and evaluation, tailored specifically for the medical domain. 5 | 6 | ## Table of Contents 7 | - [Overview](#overview) 8 | - [File Structure](#file-structure) 9 | - [Installation](#installation) 10 | - [Usage](#usage) 11 | - [Components](#components) 12 | - [Retrieval System](#retrieval-system) 13 | - [Question Answering System](#question-answering-system) 14 | - [Evaluation](#evaluation) 15 | - [Data Storage](#data-storage) 16 | - [Contributing](#contributing) 17 | - [License](#license) 18 | 19 | ## Overview 20 | 21 | The Medical RAG System is designed to enhance medical information retrieval and provide accurate answers to medical queries. It combines various retrieval methods, including BM25, bioBERT, and hybrid models, with advanced question-answering techniques to ensure precise and relevant results. 22 | 23 | 24 | ## File structure 25 | 26 | ```plaintext 27 | ├── evaluation 28 | │   ├── evaluation_data_storages 29 | │   │   ├── documentation.md 30 | │   │   ├── elasticsearch 31 | │   │   │   ├── elastic.ipynb 32 | │   │   │   └── eval_elastic.ipynb 33 | │   │   ├── faiss 34 | │   │   │   ├── conncatinatior.py 35 | │   │   │   ├── embedding_extractor.py 36 | │   │   │   └── request.ipynb 37 | │   │   └── mongodb 38 | │   │   ├── eval_mongo.ipynb 39 | │   │   └── mongoDB.ipynb 40 | │   └── evaluation_QA_system 41 | │   ├── dataset_filter 42 | │   │   └── filter_data.ipynb 43 | │   ├── evaluation_pipeline.ipynb 44 | │   ├── explore_questions.ipynb 45 | │   ├── full_text_evaluation.py 46 | │   └── RAG_evaluator.py 47 | ├── information_retrieval 48 | │   ├── document_encoding 49 | │   │   ├── bioBERT_encoder.py 50 | │   │   ├── encode_documents.ipynb 51 | │   │   └── medCPT_encoder.py 52 | │   ├── elastic_container 53 | │   │   ├── elastic.ipynb 54 | │   │   ├── ingest_data.py 55 | │   │   └── start_elasticsearch.sh 56 | │   └── faiss_container 57 | │   ├── docker-compose.yml 58 | │   ├── Dockerfile 59 | │   ├── faiss_insert_data.ipynb 60 | │   └── server.py 61 | ├── rag_system 62 | │   ├── bioBERT_encoder.py 63 | │   ├── bioBERT_retriever.py 64 | │   ├── bm25_retriever.py 65 | │   ├── hybrid_retriever.py 66 | │   ├── medCPT_encoder.py 67 | │   ├── medCPT_retriever.py 68 | │   ├── med_rag.py 69 | │   ├── openAI_chat.py 70 | │   └── pipeline.ipynb 71 | ├── README.md 72 | ├── requirements.txt 73 | └── sys_requirements.txt 74 | 75 | ``` 76 | 77 | ## Installation 78 | 79 | To set up the Medical RAG System, follow these steps: 80 | 81 | 1. **Clone the Repository** 82 | 83 | ``` 84 | git clone https://github.com/slinusc/medical_RAG_system.git 85 | cd medical_RAG_system 86 | ``` 87 | 88 | 2. **Install Dependencies** 89 | 90 | Create a virtual environment and install the required packages: 91 | 92 | ``` 93 | python -m venv venv 94 | source venv/bin/activate # On Windows, use `venv\Scripts\activate` 95 | pip install -r requirements.txt 96 | ``` 97 | 98 | 3. **Download Pre-trained Models** 99 | 100 | Ensure that you download and set up any necessary pre-trained models (e.g., BioBERT, MedCPT). 101 | 102 | ## Usage 103 | 104 | The system can be used for different purposes, including document retrieval, question answering, and evaluation. Each component has its own set of instructions and example notebooks. 
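For a quick start, the pipeline can also be driven directly from Python. The following is a minimal sketch distilled from `evaluation/evaluation_QA_system/evaluation_pipeline.ipynb`; the retriever and question-type codes mirror the ones used there, but treat the exact `MedRAG` constructor arguments as illustrative rather than canonical:

```python
import json
import sys

sys.path.append("rag_system")  # run from the repository root
from med_rag import MedRAG

# Retriever codes used in the evaluation notebooks:
# 1 = BioBERT, 2 = BM25, 3 = hybrid (BM25 + MedCPT re-ranking), 4 = MedCPT
# question_type selects the prompt for the BioASQ question type (2 = yes/no).
rag = MedRAG(retriever=3, question_type=2)

# get_answer() returns a JSON string with the generated answer, the retrieved
# and used PubMed IDs, and the retrieval/generation timings.
result = json.loads(rag.get_answer("Is metformin used to treat type 2 diabetes?"))

print(result["response"])
print("Retrieved PMIDs:", result["retrieved_PMIDs"])
print("PMIDs cited in the answer:", result["used_PMIDs"])
```

See `rag_system/pipeline.ipynb` for a complete walkthrough.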
105 | 106 | ### Retrieval System 107 | 108 | - **BM25 Retriever**: `rag_system/bm25_retriever.py` 109 | - **BioBERT Retriever**: `rag_system/bioBERT_retriever.py` 110 | - **Hybrid Retriever**: `rag_system/hybrid_retriever.py` 111 | 112 | ### Question Answering System 113 | 114 | - **Medical RAG**: `rag_system/med_rag.py` 115 | - **OpenAI Chat**: `rag_system/openAI_chat.py` 116 | 117 | ### Datasets 118 | 119 | The 2.4M PubmEd subset we used can be accessed here: [slinusc/PubMedAbstractsSubset](https://huggingface.co/datasets/slinusc/PubMedAbstractsSubset). 120 | If you're looking for the precomputed embedding vectors (MedCPT) used in our work [*Efficient and Reproducible Biomedical Question Answering using Retrieval Augmented Generation*](https://arxiv.org/abs/2505.07917), they are available in a separate dataset: [slinusc/PubMedAbstractsSubsetEmbedded](https://huggingface.co/datasets/slinusc/PubMedAbstractsSubsetEmbedded). 121 | 122 | ### Evaluation 123 | 124 | Evaluation scripts and notebooks are located in the `evaluation/evaluation_QA_system/` directory. Example notebooks are provided to demonstrate the evaluation process. 125 | 126 | #### Running an Evaluation 127 | 128 | 1. **Filter the Data (Optional)** 129 | 130 | If you need to filter your dataset before evaluation, use the provided notebook: 131 | 132 | ``` 133 | evaluation/evaluation_QA_system/dataset_filter/filter_data.ipynb 134 | ``` 135 | 136 | 2. **Evaluate** 137 | To run the evaluation pipeline, use the evaluation_pipeline.ipynb notebook located in the evaluation/evaluation_QA_system/ directory. This notebook provides a comprehensive guide and setup to evaluate the performance of the RAG system. 138 | 139 | ## Used Infrastructure 140 | 141 | The experiments were conducted on the following system: 142 | 143 | | **Component** | **Specification** | 144 | |----------------------|--------------------------------------------| 145 | | **Architecture** | x86_64 | 146 | | **CPU** | 8 CPUs | 147 | | **Model** | Intel Core Processor (Broadwell) | 148 | | **Memory** | 32 GiB total, 10 GiB used for buffers/cache | 149 | | **Storage** | 240 GiB disk size | 150 | | **Operating System** | Ubuntu 22.04.4 LTS (Jammy) | 151 | | **Kernel Version** | 5.15.0-102-generic | 152 | | **GPU** | NVIDIA A30 | 153 | 154 | 155 | ## Contributing 156 | 157 | We welcome contributions to enhance the Medical RAG System. Please follow these steps to contribute: 158 | 159 | 1. Fork the repository. 160 | 2. Create a new branch: `git checkout -b feature-branch`. 161 | 3. Make your changes and commit them: `git commit -m 'Add new feature'`. 162 | 4. Push to the branch: `git push origin feature-branch`. 163 | 5. Create a pull request. 
164 | 165 | ## 📄 Publications 166 | 167 | **Efficient and Reproducible Biomedical Question Answering using Retrieval Augmented Generation** 168 | Linus Stuhlmann, Michael Saxer, Jonathan Fürst 169 | 170 | Please cite our work: 171 | 172 | ```bibtex 173 | @INPROCEEDINGS{11081505, 174 | author={Stuhlmann, Linus and Saxer, Michael Alexander and Fürst, Jonathan}, 175 | booktitle={2025 IEEE Swiss Conference on Data Science (SDS)}, 176 | title={Efficient and Reproducible Biomedical Question Answering Using Retrieval Augmented Generation}, 177 | year={2025}, 178 | volume={}, 179 | number={}, 180 | pages={154-157}, 181 | keywords={Accuracy;Scalability;Large language models;Retrieval augmented generation;Data science;Information retrieval;Question answering (information retrieval);Hybrid power systems;Time factors;Indexing;Biomedical Information Retrieval;RetrievalAugmented Generation;Hybrid Retrieval;Large Language Models;PubMed;Information Retrieval Systems}, 182 | doi={10.1109/SDS66131.2025.00029}} 183 | ``` 184 | 185 | [Read the paper on arXiv](https://arxiv.org/abs/2505.07917) 186 | 187 | ## License 188 | 189 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 190 | ``` 191 | -------------------------------------------------------------------------------- /information_retrieval/elastic_container/elastic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "from elasticsearch import Elasticsearch\n", 11 | "import os\n", 12 | "from pathlib import Path\n", 13 | "from tqdm import tqdm" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "ObjectApiResponse({'name': 'e16354f42e49', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'QRfx48-WQEmifPZNrtrbGw', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" 25 | ] 26 | }, 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "password = os.getenv(\"ELASTIC_PASSWORD\")\n", 34 | "password = \"B*WJBFKxDIuH9erC-V2d\"\n", 35 | "\n", 36 | "es = Elasticsearch(\n", 37 | " ['https://localhost:9200'],\n", 38 | " basic_auth=('elastic', password),\n", 39 | " verify_certs=True,\n", 40 | " ca_certs=\"/home/rag/.crt/http_ca.crt\",\n", 41 | " request_timeout=60\n", 42 | " )\n", 43 | "\n", 44 | "es.info()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "pubmed_index\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "indices = es.cat.indices(format='json')\n", 62 | "\n", 63 | "# Print the indices\n", 64 | "for index in indices:\n", 65 | " print(index['index'])\n", 66 | "\n", 67 | "index = \"pubmed_index\"" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import re\n", 77 | "\n", 78 | "source_directory = 
Path('/home/rag/data/chunk')\n", 79 | "\n", 80 | "# get file namens sorted by number\n", 81 | "\n", 82 | "files = sorted([f for f in source_directory.iterdir() if f.is_file()])\n", 83 | "\n", 84 | "files" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Define the index name\n", 94 | "index_name = \"pubmed_index_embedded\"\n", 95 | "\n", 96 | "# Delete the index if it exists\n", 97 | "if es.indices.exists(index=index_name):\n", 98 | " es.indices.delete(index=index_name)\n", 99 | "\n", 100 | "# Check again if the index exists, and if not, create it\n", 101 | "if not es.indices.exists(index=index_name):\n", 102 | " # Define the mapping\n", 103 | " {\n", 104 | " \"settings\": {\n", 105 | " \"analysis\": {\n", 106 | " \"analyzer\": {\n", 107 | " \"custom_lemmatizer_analyzer\": {\n", 108 | " \"type\": \"custom\",\n", 109 | " \"tokenizer\": \"standard\",\n", 110 | " \"filter\": [\"lowercase\", \"stopwords\", \"lemmatizer_filter\"]\n", 111 | " }\n", 112 | " },\n", 113 | " \"filter\": {\n", 114 | " \"lemmatizer_filter\": {\n", 115 | " \"language\": \"English\" # Specify the language for lemmatization\n", 116 | " },\n", 117 | " \"stopwords\": {\n", 118 | " \"type\": \"stop\",\n", 119 | " \"stopwords\": \"_english_\" # the built-in English stop words list\n", 120 | " }\n", 121 | " }\n", 122 | " }\n", 123 | " },\n", 124 | " \"mappings\": {\n", 125 | " \"properties\": {\n", 126 | " \"content\": {\n", 127 | " \"type\": \"text\",\n", 128 | " \"analyzer\": \"custom_lemmatizer_analyzer\"\n", 129 | " }\n", 130 | " }\n", 131 | " }\n", 132 | "}\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "# Create the index with the defined mapping\n", 137 | "es.indices.create(index=index_name, body=mapping)\n", 138 | "\n", 139 | "source_directory = Path('/home/rag/data/chunk')\n", 140 | "error_log_path = Path('./errors.jsonl') # Pfad zur Fehlerprotokolldatei\n", 141 | "\n", 142 | "def bulk_index_documents(source_directory, index_name, error_log_path):\n", 143 | " if not source_directory.exists():\n", 144 | " print(\"The source directory does not exist.\")\n", 145 | " return\n", 146 | "\n", 147 | " actions = [] # List to store the documents to be indexed\n", 148 | "\n", 149 | " # Open the error log file for writing\n", 150 | " with error_log_path.open('w') as error_log:\n", 151 | " # Iterate through each file in the source directory\n", 152 | " num_files = 0\n", 153 | " max_files = 300\n", 154 | " for file_name in tqdm(list(os.listdir(source_directory))):\n", 155 | " if file_name.endswith('.jsonl'):\n", 156 | " source_file = source_directory / file_name\n", 157 | " \n", 158 | " # Open and read the JSONL file\n", 159 | " with open(source_file, 'r') as json_file:\n", 160 | " for line in json_file:\n", 161 | " try:\n", 162 | " doc = json.loads(line)\n", 163 | " \n", 164 | " # Remove the \"embeddings\" field from the document\n", 165 | " #if \"embeddings\" in doc:\n", 166 | " # del doc[\"embeddings\"]\n", 167 | " \n", 168 | " action = {\n", 169 | " \"_index\": index_name,\n", 170 | " \"_source\": doc\n", 171 | " }\n", 172 | " actions.append(action)\n", 173 | "\n", 174 | " if len(actions) == 200: # Bulk indexing threshold\n", 175 | " helpers.bulk(es, actions)\n", 176 | " actions = []\n", 177 | " except json.JSONDecodeError as e:\n", 178 | " # Log the error\n", 179 | " error_log.write(f\"Error in file {file_name}: {e}\\n\")\n", 180 | " error_log.write(f\"{line}\\n\")\n", 181 | " except Exception as e:\n", 182 | " 
error_log.write(f\"Unexpected error in file {file_name}: {e}\\n\")\n", 183 | " error_log.write(f\"{line}\\n\")\n", 184 | "\n", 185 | " # Index any remaining documents\n", 186 | " if actions:\n", 187 | " helpers.bulk(es, actions)\n", 188 | "\n", 189 | " print('Indexing complete')\n", 190 | "\n", 191 | "# Call the function to index the documents\n", 192 | "bulk_index_documents(source_directory, index_name, error_log_path)\n", 193 | "\n", 194 | "# Count and print the number of documents in the index\n", 195 | "count_result = es.count(index=index_name)\n", 196 | "print(f\"Index contains {count_result['count']} documents.\")" 197 | ] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.10.12" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /information_retrieval/document_encoding/encode_documents.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5ea70828-38b9-48db-a7f4-ec9c644bfc2d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import ray\n", 11 | "import torch\n", 12 | "import os\n", 13 | "import langchain_community\n", 14 | "from ray.data import ActorPoolStrategy\n", 15 | "from tqdm import tqdm\n", 16 | "import pandas as pd\n", 17 | "from ray.data import from_pandas\n", 18 | "from functools import partial\n", 19 | "import torch\n", 20 | "from functools import partial\n", 21 | "from bioBERT_encoder import bioBERTEncoder\n", 22 | "from medCPT_encoder import medCPTArticleEncoder" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "f380bee0-db3e-4c56-81b1-a827aca6d048", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Using cuda.\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 41 | "print(f\"Using {device}.\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "3c8bcc10-7ce7-4337-91b3-99795848b21c", 47 | "metadata": {}, 48 | "source": [ 49 | "### Initializing Ray" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "9bea217e-3ee8-4fe4-9c7a-511324db3215", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stderr", 60 | "output_type": "stream", 61 | "text": [ 62 | "2024-05-02 15:37:12,105\tINFO worker.py:1567 -- Connecting to existing Ray cluster at address: 10.10.2.206:6379...\n", 63 | "2024-05-02 15:37:12,116\tINFO worker.py:1743 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "runtime_env = {\n", 69 | " \"pip\": [\n", 70 | " \"langchain-text-splitters\",\n", 71 | " \"langchain_community\", \n", 72 | " \"sentence_transformers\"\n", 73 | " ],\n", 74 | "}\n", 75 | "\n", 76 | "if not ray.is_initialized():\n", 77 | " ray.init(runtime_env=runtime_env)\n", 78 | "else:\n", 79 | " ray.shutdown()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "3de246c8-44a7-4fc1-aeeb-86396be8588d", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Verfügbare Ressourcen: {'CPU': 32.0, 'object_store_memory': 17714153471.0, 'memory': 40311046965.0, 'GPU': 4.0, 'accelerator_type:T4': 4.0, 'node:10.10.3.5': 1.0, 'node:10.10.3.72': 1.0, 'node:10.10.2.206': 1.0, 'node:__internal_head__': 1.0, 'node:10.10.2.65': 1.0}\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "available_resources = ray.available_resources()\n", 98 | "print(\"Verfügbare Ressourcen:\", available_resources)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "id": "d7f12da1-fb93-4f7e-a083-140b47c01c62", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "['data/pubmed/chunk/pubmed23n0046.jsonl',\n", 111 | " 'data/pubmed/chunk/pubmed23n0050.jsonl',\n", 112 | " 'data/pubmed/chunk/pubmed23n0003.jsonl',\n", 113 | " 'data/pubmed/chunk/pubmed23n0117.jsonl',\n", 114 | " 'data/pubmed/chunk/pubmed23n0068.jsonl']" 115 | ] 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "directory_path = \"data/pubmed/chunk/\"\n", 124 | "file_names = os.listdir(directory_path)\n", 125 | "file_paths = [os.path.join(directory_path, file_name) for file_name in file_names]\n", 126 | "jsonl_file_paths = [file_path for file_path in file_paths if file_path.endswith('.jsonl')]\n", 127 | "\n", 128 | "jsonl_file_paths[:5]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "429f5635-5790-4ebe-aa5e-9d40c79c0a8c", 134 | "metadata": {}, 135 | "source": [ 136 | "### Using only head node for embedding.\n", 137 | "\n", 138 | "Initializing BioBERT Embedding Model" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "id": "96e3ffe7-c78a-4d34-9e3c-de44a392fd32", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "encoder = bioBERTEncoder()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "4298cec9-539d-4c9c-8de3-fdcd49bdde71", 154 | "metadata": {}, 155 | "source": [ 156 | "Iterating through every JSONL file adding the attribute \"embeddings\"" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 10, 162 | "id": "a64c8a49-1e6e-45ae-87f2-526eee4715d2", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [13:00:29<00:00, 473.02s/it]" 170 | ] 171 | }, 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Alle Dateien wurden verarbeitet und gespeichert.\n" 177 | ] 178 | }, 179 | { 180 | "name": "stderr", 181 | "output_type": "stream", 182 | "text": [ 183 | "\n" 184 | ] 185 | } 186 | ], 
187 | "source": [ 188 | "import os\n", 189 | "import json\n", 190 | "from pathlib import Path\n", 191 | "\n", 192 | "# Definiere die Pfade für die Quell- und Zielverzeichnisse\n", 193 | "source_directory = Path('data/pubmed/chunk')\n", 194 | "target_directory = Path('data/pubmed/embedded')\n", 195 | "target_directory.mkdir(parents=True, exist_ok=True)\n", 196 | "\n", 197 | "# Iteriert durch jede Datei im Quellverzeichnis\n", 198 | "for file_name in tqdm(os.listdir(source_directory)):\n", 199 | " if file_name.endswith('.jsonl'):\n", 200 | " source_file = source_directory / file_name\n", 201 | " target_file = target_directory / file_name\n", 202 | "\n", 203 | " # Erstellt eine neue Datei im Zielverzeichnis\n", 204 | " with open(target_file, 'w') as target:\n", 205 | " with open(source_file, 'r') as source:\n", 206 | " for line in source:\n", 207 | " # Jede Zeile ist ein JSON-Objekt\n", 208 | " item = json.loads(line)\n", 209 | " # Verarbeite das Item mit EmbedChunks\n", 210 | " embedded_item = encoder([item])[0] # [0], weil embedder eine Liste zurückgibt\n", 211 | " # Schreibe das bearbeitete Objekt in die Zieldatei\n", 212 | " target.write(json.dumps(embedded_item) + '\\n')\n", 213 | " #print(f\"{target_file} has been successfully written to data/pubmed/embedded\")\n", 214 | " \n", 215 | "print(\"Alle Dateien wurden verarbeitet und gespeichert.\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "3b1a578e-e1ca-4204-8b4b-b26fc248e400", 221 | "metadata": {}, 222 | "source": [ 223 | "To improve performance we'll try to distribute the embedding process on the Ray cluster using 4 nodes with GPUs. With one node the embedding of 1.8 mio documents took 14 hours." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "1b114122-6b49-4521-973f-e5ca45f6dc35", 229 | "metadata": {}, 230 | "source": [ 231 | "### MedCPT\n", 232 | "now encode with article encoder of MedCPT" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "id": "e586b89d-c760-41cb-b542-1222b3b69483", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "encoder = medCPTArticleEncoder()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "0c3fd8c4-f408-4eae-9ea4-e003e664ceb0", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "import os\n", 253 | "import json\n", 254 | "from pathlib import Path\n", 255 | "\n", 256 | "# Definiere die Pfade für die Quell- und Zielverzeichnisse\n", 257 | "source_directory = Path('data/pubmed/chunk')\n", 258 | "target_directory = Path('data/pubmed/embedded_MedCPT')\n", 259 | "target_directory.mkdir(parents=True, exist_ok=True)\n", 260 | "\n", 261 | "# Iteriert durch jede Datei im Quellverzeichnis\n", 262 | "for file_name in tqdm(os.listdir(source_directory)):\n", 263 | " if file_name.endswith('.jsonl'):\n", 264 | " source_file = source_directory / file_name\n", 265 | " target_file = target_directory / file_name\n", 266 | "\n", 267 | " # Erstellt eine neue Datei im Zielverzeichnis\n", 268 | " with open(target_file, 'w') as target:\n", 269 | " with open(source_file, 'r') as source:\n", 270 | " for line in source:\n", 271 | " # Jede Zeile ist ein JSON-Objekt\n", 272 | " item = json.loads(line)\n", 273 | " # Verarbeite das Item mit EmbedChunks\n", 274 | " embedded_item = encoder([item])[0] # [0], weil embedder eine Liste zurückgibt\n", 275 | " # Schreibe das bearbeitete Objekt in die Zieldatei\n", 276 | " target.write(json.dumps(embedded_item) + '\\n')\n", 277 | " 
#print(f\"{target_file} has been successfully written to data/pubmed/embedded\")\n", 278 | " \n", 279 | "print(\"Alle Dateien wurden verarbeitet und gespeichert.\")" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3 (ipykernel)", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.9.18" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 5 304 | } 305 | -------------------------------------------------------------------------------- /evaluation/evaluation_QA_system/evaluation_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Evaluation of the RAG system" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "first we import some neccessary libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import sys\n", 26 | "import os\n", 27 | "from RAG_evaluator import RAG_evaluator\n", 28 | "sys.path.append(\"../../rag_system/\")\n", 29 | "from med_rag import MedRAG" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "now we define an experiment name, this name should ! uniqely! identify the experiemnal run" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "experiment_name = \"experiment_debugginglist_questions\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "then we implement a running experiment by using rag system one two and three" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Directory 'experiment_debugginglist_questions' created at /home/ubuntu/questions_answers_data/experiment_results/experiment_debugginglist_questions\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "# Base directory where the new folder will be created\n", 70 | "base_directory = \"/home/ubuntu/questions_answers_data/experiment_results\"\n", 71 | "# input directory, change if diffrent one is used\n", 72 | "question_input_dir = \"/home/ubuntu/questions_answers_data/all_questions_in_system_min3.json\"\n", 73 | "\n", 74 | "\n", 75 | "# Construct the path for the new experiment folder\n", 76 | "experiment_folder_path = os.path.join(base_directory, experiment_name)\n", 77 | "\n", 78 | "# Create the directory if it does not exist\n", 79 | "if not os.path.exists(experiment_folder_path):\n", 80 | " os.makedirs(experiment_folder_path)\n", 81 | " print(f\"Directory '{experiment_name}' created at {experiment_folder_path}\")\n", 82 | "else:\n", 83 | " print(f\"Directory '{experiment_name}' already exists at {experiment_folder_path}\")\n", 84 | "\n", 85 | "# Construct the path for the JSON file\n", 86 | "output_path_retriever_1 = os.path.join(experiment_folder_path, \"result_ragver_1.json\")\n", 87 | 
"output_path_retriever_2 = os.path.join(experiment_folder_path, \"result_ragver_2.json\")\n", 88 | "output_path_retriever_3 = os.path.join(experiment_folder_path, \"result_ragver_3.json\")\n", 89 | "output_path_retriever_4 = os.path.join(experiment_folder_path, \"result_ragver_4.json\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Evaluation of the 3 retriever types used in the RAG" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "- Retriever 1: BioBERT\n", 104 | "- Retriever 2: BM25\n", 105 | "- Retriever 3: Hybrid Retriever BM25 reranked with medCPT cross encoder\n", 106 | "- Retriever 4: medCPT Retriever with reranking" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Retriever 2: BM25" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "rag_system = MedRAG(retriever=2, question_type=2)\n", 123 | "\n", 124 | "rag_type = RAG_evaluator(\n", 125 | " rag_model=rag_system,\n", 126 | " path_to_question_json=question_input_dir,\n", 127 | " output_path=output_path_retriever_2,\n", 128 | ")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "rag_type.run_eval()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 10, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Summary Statistics for RAG with retriever 4\n", 150 | "Total Questions: 85\n", 151 | "\n", 152 | "Response Time:\n", 153 | "Mean: 3.81 seconds\n", 154 | "Standard Deviation: 1.42 seconds\n", 155 | "\n", 156 | "Summary of non-answered questions:\n", 157 | "Absolute count - No Docs Found: 0\n", 158 | "Percentage - No Docs Found: 0.00%\n", 159 | "\n", 160 | "Metrics - RAG Q&A:\n", 161 | "Accuracy: 0.86\n", 162 | "Recall: 0.86\n", 163 | "Precision: 0.87\n", 164 | "F1 Score: 0.86\n", 165 | "\n", 166 | "Metrics - Retriever:\n", 167 | "Recall Retriever: 0.32\n", 168 | "Precision Retriever: 0.01\n", 169 | "F1 Score Retriever: 0.02\n", 170 | "\n", 171 | "Metrics - Used vs Retrieved:\n", 172 | "Recall Used vs Retrieved: 0.49\n", 173 | "Precision Used vs Retrieved: 0.01\n", 174 | "F1 Score Used vs Retrieved: 0.02\n", 175 | "\n", 176 | "Additional metrics:\n", 177 | "Mean response time retriever: 2.33\n", 178 | "Standard deviation response time retriever: 0.71\n", 179 | "Mean response time generation: 1.49\n", 180 | "Standard deviation response time generation: 1.33\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "rag_type.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min2/result_ragver_4.json\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### Evaluation of all question types" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 2, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Directory 'experiment_bioASQ_min1_bioBERT' already exists at /home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_bioBERT\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "experiment_name = \"experiment_bioASQ_min1_bioBERT\"\n", 210 | "# Base directory where the new folder will be 
created\n", 211 | "base_directory = \"/home/ubuntu/questions_answers_data/experiment_results\"\n", 212 | "# input directory, change if diffrent one is used\n", 213 | "question_input_factoid = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/factoid_questions.json\"\n", 214 | "question_input_summary = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/summary_questions.json\"\n", 215 | "question_input_list = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/list_questions.json\"\n", 216 | "question_input_yesno = \"/home/ubuntu/questions_answers_data/bioASQ_data_min_1/yesno_questions.json\"\n", 217 | "\n", 218 | "# Construct the path for the new experiment folder\n", 219 | "experiment_folder_path = os.path.join(base_directory, experiment_name)\n", 220 | "\n", 221 | "# Create the directory if it does not exist\n", 222 | "if not os.path.exists(experiment_folder_path):\n", 223 | " os.makedirs(experiment_folder_path)\n", 224 | " print(f\"Directory '{experiment_name}' created at {experiment_folder_path}\")\n", 225 | "else:\n", 226 | " print(f\"Directory '{experiment_name}' already exists at {experiment_folder_path}\")\n", 227 | "\n", 228 | "# Construct the path for the JSON file\n", 229 | "output_path_question_factoid = os.path.join(experiment_folder_path, \"result_factoid.json\")\n", 230 | "output_path_question_summary = os.path.join(experiment_folder_path, \"result_summary.json\")\n", 231 | "output_path_question_list = os.path.join(experiment_folder_path, \"result_list.json\")\n", 232 | "output_path_question_yesno = os.path.join(experiment_folder_path, \"result_yesno.json\")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "1. Factoid" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 3, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "rag_system = MedRAG(retriever=1, question_type=1)\n", 249 | "\n", 250 | "eval_factoid = RAG_evaluator(\n", 251 | " rag_model=rag_system,\n", 252 | " path_to_question_json=question_input_factoid,\n", 253 | " output_path=output_path_question_factoid,\n", 254 | ")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "eval_factoid.run_eval()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 28, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "Summary Statistics for RAG with retriever Unknown\n", 276 | "Total Questions: 175\n", 277 | "\n", 278 | "Response Time:\n", 279 | "Mean: 6.39 seconds\n", 280 | "Standard Deviation: 1.66 seconds\n", 281 | "\n", 282 | "Summary of non-answered questions:\n", 283 | "Absolute count - No Docs Found: 0\n", 284 | "Percentage - No Docs Found: 0.00%\n", 285 | "\n", 286 | "Metrics - Retriever:\n", 287 | "Average Recall: 0.58\n", 288 | "\n", 289 | "Metrics for RAG Usage:\n", 290 | "Average Precision: 0.34\n", 291 | "\n", 292 | "Additional metrics:\n", 293 | "Mean response time retriever: 4.16\n", 294 | "Standard deviation response time retriever: 1.17\n", 295 | "Mean response time generation: 2.22\n", 296 | "Standard deviation response time generation: 1.05\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "eval_factoid.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_factoid.json\")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": 
{}, 307 | "source": [ 308 | "2. Summary" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 11, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "rag_system = MedRAG(retriever=1, question_type=1)\n", 318 | "\n", 319 | "eval_summary = RAG_evaluator(\n", 320 | " rag_model=rag_system,\n", 321 | " path_to_question_json=question_input_summary,\n", 322 | " output_path=output_path_question_summary,\n", 323 | ")" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "eval_summary.run_eval()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 27, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "Summary Statistics for RAG with retriever Unknown\n", 345 | "Total Questions: 121\n", 346 | "\n", 347 | "Response Time:\n", 348 | "Mean: 7.50 seconds\n", 349 | "Standard Deviation: 1.88 seconds\n", 350 | "\n", 351 | "Summary of non-answered questions:\n", 352 | "Absolute count - No Docs Found: 0\n", 353 | "Percentage - No Docs Found: 0.00%\n", 354 | "\n", 355 | "Metrics - Retriever:\n", 356 | "Average Recall: 0.59\n", 357 | "\n", 358 | "Metrics for RAG Usage:\n", 359 | "Average Precision: 0.32\n", 360 | "\n", 361 | "Additional metrics:\n", 362 | "Mean response time retriever: 4.22\n", 363 | "Standard deviation response time retriever: 1.33\n", 364 | "Mean response time generation: 3.28\n", 365 | "Standard deviation response time generation: 1.25\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "eval_summary.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_summary.json\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "3. 
List" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 14, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "rag_system = MedRAG(retriever=1, question_type=4)\n", 387 | "\n", 388 | "eval_list = RAG_evaluator(\n", 389 | " rag_model=rag_system,\n", 390 | " path_to_question_json=question_input_list,\n", 391 | " output_path=output_path_question_list,\n", 392 | ")" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "eval_list.run_eval()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 26, 407 | "metadata": {}, 408 | "outputs": [ 409 | { 410 | "name": "stdout", 411 | "output_type": "stream", 412 | "text": [ 413 | "Summary Statistics for RAG with retriever Unknown\n", 414 | "Total Questions: 131\n", 415 | "\n", 416 | "Response Time:\n", 417 | "Mean: 5.94 seconds\n", 418 | "Standard Deviation: 1.45 seconds\n", 419 | "\n", 420 | "Summary of non-answered questions:\n", 421 | "Absolute count - No Docs Found: 0\n", 422 | "Percentage - No Docs Found: 0.00%\n", 423 | "\n", 424 | "Metrics - Retriever:\n", 425 | "Average Recall: 0.51\n", 426 | "\n", 427 | "Metrics for RAG Usage:\n", 428 | "Average Precision: 0.34\n", 429 | "\n", 430 | "Additional metrics:\n", 431 | "Mean response time retriever: 4.37\n", 432 | "Standard deviation response time retriever: 1.29\n", 433 | "Mean response time generation: 1.57\n", 434 | "Standard deviation response time generation: 0.7\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "eval_list.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_list.json\")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "4. 
Yes/No" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 3, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "rag_system = MedRAG(retriever=1, question_type=2)\n", 456 | "\n", 457 | "eval_yesno = RAG_evaluator(\n", 458 | " rag_model=rag_system,\n", 459 | " path_to_question_json=question_input_yesno,\n", 460 | " output_path=output_path_question_yesno,\n", 461 | ")" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "eval_yesno.run_eval()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 4, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stdout", 480 | "output_type": "stream", 481 | "text": [ 482 | "Summary Statistics for RAG with retriever Unknown\n", 483 | "Total Questions: 160\n", 484 | "\n", 485 | "Response Time:\n", 486 | "Mean: 5.80 seconds\n", 487 | "Standard Deviation: 1.30 seconds\n", 488 | "\n", 489 | "Summary of non-answered questions:\n", 490 | "Absolute count - No Docs Found: 0\n", 491 | "Percentage - No Docs Found: 0.00%\n", 492 | "\n", 493 | "Metrics - RAG Q&A:\n", 494 | "Accuracy: 0.86\n", 495 | "Recall: 0.86\n", 496 | "Precision: 0.89\n", 497 | "F1 Score: 0.86\n", 498 | "\n", 499 | "Metrics - Retriever:\n", 500 | "Average Recall: 0.54\n", 501 | "\n", 502 | "Metrics for RAG Usage:\n", 503 | "Average Precision: 0.28\n", 504 | "\n", 505 | "Additional metrics:\n", 506 | "Mean response time retriever: 4.66\n", 507 | "Standard deviation response time retriever: 1.24\n", 508 | "Mean response time generation: 1.14\n", 509 | "Standard deviation response time generation: 0.38\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "eval_yesno.analyze_performance(\"/home/ubuntu/questions_answers_data/experiment_results/experiment_bioASQ_min1_hybrid/result_yesno.json\")" 515 | ] 516 | } 517 | ], 518 | "metadata": { 519 | "kernelspec": { 520 | "display_name": "Python 3", 521 | "language": "python", 522 | "name": "python3" 523 | }, 524 | "language_info": { 525 | "codemirror_mode": { 526 | "name": "ipython", 527 | "version": 3 528 | }, 529 | "file_extension": ".py", 530 | "mimetype": "text/x-python", 531 | "name": "python", 532 | "nbconvert_exporter": "python", 533 | "pygments_lexer": "ipython3", 534 | "version": "3.10.12" 535 | } 536 | }, 537 | "nbformat": 4, 538 | "nbformat_minor": 2 539 | } 540 | -------------------------------------------------------------------------------- /evaluation/evaluation_QA_system/RAG_evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | from tqdm import tqdm 3 | import re 4 | import json 5 | import pandas as pd 6 | from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 7 | 8 | 9 | class RAG_evaluator: 10 | """ 11 | Evaluates the performance of a Retrieval-Augmented Generation (RAG) system. 
12 | """ 13 | 14 | def __init__( 15 | self, rag_model, path_to_question_json, output_path, multiplechoice=False 16 | ): 17 | self.rag_model = rag_model 18 | self.path_to_jsonfile = path_to_question_json 19 | self.output_path = output_path 20 | self.multiple_choice = multiplechoice 21 | 22 | def run_eval(self): 23 | """Executes the evaluation of the RAG system.""" 24 | start_time = time.time() # Start timing 25 | 26 | # Read the input JSON file 27 | with open(self.path_to_jsonfile, "r") as file: 28 | data = json.load(file) 29 | 30 | results = [] 31 | i = 0 32 | for question in tqdm(data["questions"], desc="Processing questions"): 33 | response = self.request_selector(question) 34 | if response is not None: 35 | results.append(response) 36 | 37 | # Write the results to the output JSON file 38 | with open(self.output_path, "w") as file: 39 | json.dump(results, file, indent=4) 40 | 41 | elapsed_time = time.time() - start_time 42 | print(f"Results written to {self.output_path}") 43 | print(f"Processing time: {elapsed_time:.2f} seconds") 44 | 45 | def request_selector(self, question): 46 | """Selects the appropriate RAG model and processes the question.""" 47 | try: 48 | if not self.multiple_choice: 49 | match question["type"]: 50 | case "yesno": 51 | return self.handle_yesno(question) 52 | case "list": 53 | return self.handle_list(question) 54 | case "summary" | "factoid": 55 | return self.handle_summary_factoid(question) 56 | case _: 57 | return None 58 | else: 59 | return self.handle_multiple_choice(question) 60 | except Exception as e: 61 | print(e) 62 | return None 63 | 64 | def handle_summary_factoid(self, question): 65 | """Handles 'yesno' questions.""" 66 | start_time = time.time() 67 | rag_answer = json.loads(self.rag_model.get_answer(question["body"])) 68 | elapsed_time = time.time() - start_time 69 | 70 | response = rag_answer.get("response") 71 | k_pubmedids = list(map(str, rag_answer["retrieved_PMIDs"])) 72 | used_pubmedids = list(map(str, rag_answer["used_PMIDs"])) 73 | 74 | retriever_time = rag_answer["retrieval_time"] 75 | generation_time = rag_answer["generation_time"] 76 | 77 | ground_truth_ids = self.extract_pubmedid(question["documents"]) 78 | retrieved_correct_ids, num_correct_retrieved_ids, matching_retrieved_ids = ( 79 | self.compare_pubmed_ids(k_pubmedids, question["documents"]) 80 | ) 81 | ( 82 | rag_used_correct_ids, 83 | rag_used_num_correct_retrieved_ids, 84 | rag_used_matching_retrieved_ids, 85 | ) = self.compare_pubmed_ids(used_pubmedids, question["documents"]) 86 | 87 | limit = 0 88 | 89 | answered_correct = self.llm_eval(limit, response, question["ideal_answer"]) 90 | 91 | return { 92 | "questionid": question["id"], 93 | "querytype": question["type"], 94 | "question": question["body"], 95 | "trueresponse_exact": question["ideal_answer"], 96 | "ragresponse": response, 97 | "answered_correct": answered_correct, 98 | "pmids_retrieved": k_pubmedids, 99 | "pmids_uses_by_rag": used_pubmedids, 100 | "pmids_ground_truth": ground_truth_ids, 101 | "retrieved_correct_pubmedid": retrieved_correct_ids, 102 | "num_correct_retrieved_ids": num_correct_retrieved_ids, 103 | "matching_retrieved_ids": matching_retrieved_ids, 104 | "rag_used_correct_ids": rag_used_correct_ids, 105 | "rag_used_num_correct_retrieved_ids": rag_used_num_correct_retrieved_ids, 106 | "rag_used_matching_retrieved_ids": rag_used_matching_retrieved_ids, 107 | "requestime": elapsed_time, 108 | "retrievment_time": retriever_time, 109 | "generation_time": generation_time, 110 | } 111 | 112 | def handle_list(self, 
question): 113 | """Handles 'yesno' questions.""" 114 | start_time = time.time() 115 | rag_answer = json.loads(self.rag_model.get_answer(question["body"])) 116 | elapsed_time = time.time() - start_time 117 | 118 | response = rag_answer.get("response") 119 | k_pubmedids = list(map(str, rag_answer["retrieved_PMIDs"])) 120 | used_pubmedids = list(map(str, rag_answer["used_PMIDs"])) 121 | 122 | retriever_time = rag_answer["retrieval_time"] 123 | generation_time = rag_answer["generation_time"] 124 | 125 | ground_truth_ids = self.extract_pubmedid(question["documents"]) 126 | retrieved_correct_ids, num_correct_retrieved_ids, matching_retrieved_ids = ( 127 | self.compare_pubmed_ids(k_pubmedids, question["documents"]) 128 | ) 129 | ( 130 | rag_used_correct_ids, 131 | rag_used_num_correct_retrieved_ids, 132 | rag_used_matching_retrieved_ids, 133 | ) = self.compare_pubmed_ids(used_pubmedids, question["documents"]) 134 | 135 | answered_correct, response, exact_answer = self.list_eval( 136 | response, question["exact_answer"] 137 | ) 138 | 139 | return { 140 | "questionid": question["id"], 141 | "querytype": question["type"], 142 | "question": question["body"], 143 | "trueresponse_exact": exact_answer, 144 | "ragresponse": response, 145 | "answered_correct": answered_correct, 146 | "pmids_retrieved": k_pubmedids, 147 | "pmids_uses_by_rag": used_pubmedids, 148 | "pmids_ground_truth": ground_truth_ids, 149 | "retrieved_correct_pubmedid": retrieved_correct_ids, 150 | "num_correct_retrieved_ids": num_correct_retrieved_ids, 151 | "matching_retrieved_ids": matching_retrieved_ids, 152 | "rag_used_correct_ids": rag_used_correct_ids, 153 | "rag_used_num_correct_retrieved_ids": rag_used_num_correct_retrieved_ids, 154 | "rag_used_matching_retrieved_ids": rag_used_matching_retrieved_ids, 155 | "requestime": elapsed_time, 156 | "retrievment_time": retriever_time, 157 | "generation_time": generation_time, 158 | } 159 | 160 | def handle_yesno(self, question): 161 | """Handles 'yesno' questions.""" 162 | start_time = time.time() 163 | rag_answer = json.loads(self.rag_model.get_answer(question["body"])) 164 | elapsed_time = time.time() - start_time 165 | 166 | response = rag_answer.get("response") 167 | k_pubmedids = list(map(str, rag_answer["retrieved_PMIDs"])) 168 | used_pubmedids = list(map(str, rag_answer["used_PMIDs"])) 169 | 170 | retriever_time = rag_answer["retrieval_time"] 171 | generation_time = rag_answer["generation_time"] 172 | 173 | ground_truth_ids = self.extract_pubmedid(question["documents"]) 174 | retrieved_correct_ids, num_correct_retrieved_ids, matching_retrieved_ids = ( 175 | self.compare_pubmed_ids(k_pubmedids, question["documents"]) 176 | ) 177 | ( 178 | rag_used_correct_ids, 179 | rag_used_num_correct_retrieved_ids, 180 | rag_used_matching_retrieved_ids, 181 | ) = self.compare_pubmed_ids(used_pubmedids, question["documents"]) 182 | 183 | answered_correct = self.yesno_eval(response, question["exact_answer"]) 184 | 185 | return { 186 | "questionid": question["id"], 187 | "querytype": question["type"], 188 | "question": question["body"], 189 | "trueresponse_exact": question["exact_answer"].lower(), 190 | "ragresponse": response.lower(), 191 | "answered_correct": answered_correct, 192 | "pmids_retrieved": k_pubmedids, 193 | "pmids_uses_by_rag": used_pubmedids, 194 | "pmids_ground_truth": ground_truth_ids, 195 | "retrieved_correct_pubmedid": retrieved_correct_ids, 196 | "num_correct_retrieved_ids": num_correct_retrieved_ids, 197 | "matching_retrieved_ids": matching_retrieved_ids, 198 | 
"rag_used_correct_ids": rag_used_correct_ids, 199 | "rag_used_num_correct_retrieved_ids": rag_used_num_correct_retrieved_ids, 200 | "rag_used_matching_retrieved_ids": rag_used_matching_retrieved_ids, 201 | "requestime": elapsed_time, 202 | "retrievment_time": retriever_time, 203 | "generation_time": generation_time, 204 | } 205 | 206 | def handle_multiple_choice(self, question): 207 | """Handles multiple-choice questions.""" 208 | start_time = time.time() 209 | rag_answer = json.loads( 210 | self.rag_model.get_answer( 211 | f"{question['question']} \n" 212 | f"1: {question['opa']} \n" 213 | f"2: {question['opb']} \n" 214 | f"3: {question['opc']} \n" 215 | f"4: {question['opd']}" 216 | ) 217 | ) 218 | elapsed_time = time.time() - start_time 219 | 220 | response = rag_answer.get("response") 221 | k_pubmedids = rag_answer["retrieved_PMIDs"] 222 | used_pubmedids = rag_answer["used_PMIDs"] 223 | 224 | retriever_time = rag_answer["retrieval_time"] 225 | generation_time = rag_answer["generation_time"] 226 | 227 | answered_correct = self.evaluate_MEDMCQA(response, question["cop"]) 228 | 229 | return { 230 | "questionid": question["id"], 231 | "querytype": "MEDCQA" + question["choice_type"], 232 | "question": question["question"], 233 | "trueresponse_exact": question["cop"], 234 | "ragresponse": response, 235 | "answered_correct": answered_correct, 236 | "pmids_retrieved": k_pubmedids, 237 | "pmids_uses_by_rag": used_pubmedids, 238 | "pmids_ground_truth": "none_for_question_type", 239 | "requestime": elapsed_time, 240 | "retrievment_time": retriever_time, 241 | "generation_time": generation_time, 242 | } 243 | 244 | def evaluate_MEDMCQA(self, rag_response, true_response): 245 | """Evaluates multiple-choice questions.""" 246 | try: 247 | return int(rag_response) == int(true_response) 248 | except Exception: 249 | return False 250 | 251 | def dummy_llm(self): 252 | # delete if real implementation is done 253 | pass 254 | 255 | def llm_eval(self, limit, rag_response, true_response): 256 | return "Not_evaluated_yet" 257 | limit = limit + 1 258 | response = self.dummy_llm(rag_response, true_response) 259 | if int(response) == 0: 260 | return False 261 | elif int(response) == 1: 262 | return True 263 | else: # if there is no valid response we try 2 times more to get one else we break 264 | if limit < 3: 265 | return self.llm_eval(limit, rag_response, true_response) 266 | else: 267 | return "no_valid_response_possible" 268 | 269 | def yesno_eval(self, rag_response, true_response): 270 | """Evaluates 'yesno' questions.""" 271 | valid_responses = {"yes", "no"} 272 | if ( 273 | rag_response.lower() not in valid_responses 274 | or true_response.lower() not in valid_responses 275 | ): 276 | return False 277 | return rag_response.lower() == true_response.lower() 278 | 279 | def list_eval(self, rag_response, true_response): 280 | # Normalize responses using the helper function 281 | normalized_rag = self.flatten_and_normalize(rag_response) 282 | normalized_true = self.flatten_and_normalize(true_response) 283 | 284 | # Check if at least one item matches 285 | is_any_match = bool(set(normalized_rag) & set(normalized_true)) 286 | 287 | # Check if at least one item matches 288 | is_any_match = bool(set(normalized_rag) & set(normalized_true)) 289 | 290 | # Similarity score (nur wenns de linus wett) 291 | # The similarity score is calculated as the Jaccard similarity index, which is the size of the intersection 292 | # of the two sets divided by the size of their union. 
This gives us a measure of similarity based on how many 293 | # items are common to both sets relative to the total number of unique items across both sets. 294 | # intersection = set(normalized_rag).intersection(set(normalized_true)) 295 | # union = set(normalized_rag).union(set(normalized_true)) 296 | # similarity_score = len(intersection) / len(union) if union else 1.0 # Handle division by zero if both lists are empty 297 | 298 | return is_any_match, normalized_rag, normalized_true # , similarity_score, 299 | 300 | def compare_pubmed_ids(self, pubmed_ids, documents): 301 | """Compares PubMed IDs returned by the RAG system.""" 302 | if not isinstance(pubmed_ids, list): 303 | pubmed_ids = [] 304 | 305 | extracted_ids = [ 306 | re.search(r"pubmed/(\d+)", doc).group(1) 307 | for doc in documents 308 | if re.search(r"pubmed/(\d+)", doc) 309 | ] 310 | 311 | matched_ids = [pid for pid in extracted_ids if pid in pubmed_ids] 312 | 313 | return bool(matched_ids), len(matched_ids), matched_ids 314 | 315 | def extract_pubmedid(self, documents): 316 | """Extracts PubMed IDs from document URLs.""" 317 | return [ 318 | re.search(r"pubmed/(\d+)", doc).group(1) 319 | for doc in documents 320 | if re.search(r"pubmed/(\d+)", doc) 321 | ] 322 | 323 | def manual_accuracy_score(self, y_true, y_pred): 324 | """Calculates the accuracy manually.""" 325 | if len(y_true) != len(y_pred): 326 | raise ValueError( 327 | "The length of true labels and predicted labels must be the same." 328 | ) 329 | return sum(1 for true, pred in zip(y_true, y_pred) if true == pred) / len( 330 | y_true 331 | ) 332 | 333 | def flatten_and_normalize(self, response): 334 | # This helper function flattens nested lists and normalizes strings 335 | flattened = [] 336 | for item in response: 337 | if isinstance(item, list): 338 | # If the item is a list, extend the flattened list with normalized subitems 339 | flattened.extend([str(subitem).lower().strip() for subitem in item]) 340 | else: 341 | # Otherwise, just append the normalized item 342 | flattened.append(str(item).lower().strip()) 343 | return flattened 344 | 345 | # Function to handle lists, flattening nested lists and normalizing strings 346 | def process_list(self, items): 347 | flattened = [] 348 | for item in items: 349 | if isinstance(item, list): 350 | # Recursively process nested lists 351 | flattened.extend(self.process_list(item)) 352 | else: 353 | # Normalize non-list items 354 | flattened.append(self.normalize(item)) 355 | return flattened 356 | 357 | # Helper function to handle string normalization 358 | def normalize(self, item): 359 | return str(item).lower().strip() 360 | 361 | def flatten_and_normalize(self, response): 362 | 363 | # Check if the response is a dictionary and process any lists found within 364 | if isinstance(response, dict): 365 | flattened = [] 366 | for value in response.values(): 367 | if isinstance(value, list): 368 | flattened.extend(self.process_list(value)) 369 | else: 370 | flattened.append(self.normalize(value)) 371 | return flattened 372 | elif isinstance(response, list): 373 | # If the initial response is a list, process it directly 374 | return self.process_list(response) 375 | else: 376 | # Handle single non-list items 377 | return [self.normalize(response)] 378 | 379 | def analyze_performance(self, json_file_path): 380 | """Analysiert die Performance anhand der Daten aus einer JSON-Datei.""" 381 | with open(json_file_path, "r") as file: 382 | data = json.load(file) 383 | 384 | df = pd.DataFrame(data) 385 | 386 | retriever_match = 
re.search(r"ragver_(\d+)", json_file_path) 387 | retriever = retriever_match.group(1) if retriever_match else "Unknown" 388 | print(f"Summary Statistics for RAG with retriever {retriever}") 389 | print(f"Total Questions: {len(df)}") 390 | 391 | mean_response_time = df["requestime"].mean() 392 | sd_response_time = df["requestime"].std() 393 | print("\nResponse Time:") 394 | print(f"Mean: {mean_response_time:.2f} seconds") 395 | print(f"Standard Deviation: {sd_response_time:.2f} seconds") 396 | 397 | 398 | accuracy = self.manual_accuracy_score( 399 | df["trueresponse_exact"], df["ragresponse"] 400 | ) 401 | recall = recall_score( 402 | df["trueresponse_exact"], 403 | df["ragresponse"], 404 | average="weighted", 405 | zero_division=0, 406 | ) 407 | precision = precision_score( 408 | df["trueresponse_exact"], 409 | df["ragresponse"], 410 | average="weighted", 411 | zero_division=0, 412 | ) 413 | f1 = f1_score( 414 | df["trueresponse_exact"], 415 | df["ragresponse"], 416 | average="weighted", 417 | zero_division=0, 418 | ) 419 | 420 | 421 | recall_list = [] 422 | precision_list = [] 423 | 424 | for i in range(len(df)): 425 | ground_truth_pmids = list(df["pmids_ground_truth"][i]) 426 | matching_retrieved_ids = list(df["matching_retrieved_ids"][i]) 427 | retrieved_pmids = list(df["pmids_retrieved"][i]) 428 | matching_used_ids = list(df["rag_used_matching_retrieved_ids"][i]) 429 | used_pmids = list(df["pmids_uses_by_rag"][i]) 430 | 431 | recall_retrieval = ( 432 | len(matching_retrieved_ids) / len(ground_truth_pmids) 433 | if ground_truth_pmids 434 | else 0 435 | ) 436 | precision_rag = ( 437 | len(matching_used_ids) / len(used_pmids) 438 | if used_pmids 439 | else 0 440 | ) 441 | 442 | recall_list.append(recall_retrieval) 443 | precision_list.append(precision_rag) 444 | 445 | avg_recall_retrieval = sum(recall_list) / len(recall_list) 446 | avg_precision_rag = sum(precision_list) / len(precision_list) 447 | 448 | print("\nSummary of non-answered questions:") 449 | count_no_docs_found = (df["ragresponse"] == "no_docs_found").sum() 450 | total_specific_counts = count_no_docs_found # Adjust this if there are other specific counts to include 451 | total_rows = len(df) 452 | percentage_not_answered = (total_specific_counts / total_rows) * 100 453 | print(f"Absolute count - No Docs Found: {total_specific_counts}") 454 | print(f"Percentage - No Docs Found: {percentage_not_answered:.2f}%") 455 | 456 | 457 | print("\nMetrics - RAG Q&A:") 458 | print(f"Accuracy: {accuracy:.2f}") 459 | print(f"Recall: {recall:.2f}") 460 | print(f"Precision: {precision:.2f}") 461 | print(f"F1 Score: {f1:.2f}") 462 | 463 | 464 | print("\nMetrics - Retriever:") 465 | print(f"Average Recall: {avg_recall_retrieval:.2f}") 466 | 467 | print("\nMetrics for RAG Usage:") 468 | print(f"Average Precision: {avg_precision_rag:.2f}") 469 | 470 | print("\nAdditional metrics:") 471 | print(f"Mean response time retriever: {round(df['retrievment_time'].mean(), 2)}") 472 | print(f"Standard deviation response time retriever: {round(df['retrievment_time'].std(), 2)}") 473 | print(f"Mean response time generation: {round(df['generation_time'].mean(), 2)}") 474 | print(f"Standard deviation response time generation: {round(df['generation_time'].std(), 2)}") 475 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/mongodb/eval_mongo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"source": [ 6 | "### Evaluation of the TF-IDF ranking implemented in MongoDB for probabilistic full text search\n", 7 | "\n", 8 | "We configurated a mongodb instance in a docker container using 16GB of RAM and 4 cores and port forwarding to the host machine on port 27017. We indexed the 23.9m documents on the content field using the TF-IDF ranking." 9 | ], 10 | "metadata": { 11 | "collapsed": false 12 | }, 13 | "id": "e99371764495f1bd" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "source": [], 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "id": "42bf126ed742dd07" 22 | }, 23 | { 24 | "cell_type": "code", 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Collections in der Datenbank: ['Docs', 'all_docs']\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from pymongo import MongoClient\n", 36 | "import numpy as np\n", 37 | "from pathlib import Path\n", 38 | "import os\n", 39 | "import json\n", 40 | "from tqdm import tqdm\n", 41 | "\n", 42 | "# Connect to MongoDB\n", 43 | "client = MongoClient('localhost', 27017)\n", 44 | "db = client['PubMed']\n", 45 | "collection = db['Docs']\n", 46 | "\n", 47 | "collections = db.list_collection_names()\n", 48 | "print(\"Collections in der Datenbank:\", collections)" 49 | ], 50 | "metadata": { 51 | "collapsed": false, 52 | "ExecuteTime": { 53 | "end_time": "2024-04-11T14:33:26.784113Z", 54 | "start_time": "2024-04-11T14:33:26.044687Z" 55 | } 56 | }, 57 | "id": "ef097ce096f81125", 58 | "execution_count": 1 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "Now we define the search query and the number of results we want to retrieve. We only retrieve the PMIDs of the documents to compare the results with the relevant documents to the related queries by using the bioASQ dataset." 64 | ], 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "id": "715030c934a661f8" 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "This funktion retrieves the PMIDs of the documents that contain the search query in the content field.\n", 74 | "\n", 75 | "This query only retrieves documents that contain the search query in the content field without any ranking. Thus, the results are not sorted by relevance." 76 | ], 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "id": "24e31789dfd838c2" 81 | }, 82 | { 83 | "cell_type": "code", 84 | "outputs": [], 85 | "source": [ 86 | "def search(query):\n", 87 | " results = collection.find({\"$text\": {\"$search\": query}}).limit(100)\n", 88 | " return results" 89 | ], 90 | "metadata": { 91 | "collapsed": false, 92 | "ExecuteTime": { 93 | "end_time": "2024-04-11T14:39:17.794498Z", 94 | "start_time": "2024-04-11T14:39:17.790223Z" 95 | } 96 | }, 97 | "id": "87a03f7fa95bf5bd", 98 | "execution_count": 10 99 | }, 100 | { 101 | "cell_type": "code", 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": "[{'_id': ObjectId('6617aff5fb3d2cdc7cd9b435'),\n 'id': 'pubmed23n0045_2126',\n 'title': 'Distribution of somatostatin-28 (1-12) in the cat brainstem: an immunocytochemical study.',\n 'content': 'We studied the distribution of somatostatin-28 (1-12)-immunoreactive fibers and cell bodies in the cat brainstem. 
A moderate density of cell bodies containing the peptide was observed in the ventral nucleus of the lateral lemniscus, accessory dorsal tegmental nucleus, retrofacial nucleus and in the lateral reticular nucleus, whereas a low density of such perikarya was found in the interpeduncular nucleus, nucleus incertus, nucleus sagulum, gigantocellular tegmental field, nucleus of the trapezoid body, nucleus praepositus hypoglosii, lateral and magnocellular tegmental fields, nucleus of the solitary tract, nucleus ambiguous and in the nucleus intercalatus. Moreover, a moderate density of somatostatin-28 (1-12)-immunoreactive processes was found in the dorsal nucleus of the raphe, dorsal tegmental nucleus, accessory dorsal tegmental nucleus, periaqueductal gray and in the marginal nucleus of the brachium conjunctivum. Finally, few immunoreactive fibers were visualized in the interpeduncular nucleus, cuneiform nucleus, locus coeruleus, nucleus incertus, superior and inferior central nuclei, nucleus sagulum, ventral nucleus of the lateral lemniscus, nucleus praepositus hypoglosii, medial vestibular nucleus, Kölliker-Fuse area, nucleus ambiguous, retrofacial nucleus, postpyramidal nucleus of the raphe, nucleus of the solitary tract, dorsal motor nucleus of the vagus, lateral reticular nucleus and laminar and alaminar spinal trigeminal nuclei.',\n 'contents': 'Distribution of somatostatin-28 (1-12) in the cat brainstem: an immunocytochemical study. We studied the distribution of somatostatin-28 (1-12)-immunoreactive fibers and cell bodies in the cat brainstem. A moderate density of cell bodies containing the peptide was observed in the ventral nucleus of the lateral lemniscus, accessory dorsal tegmental nucleus, retrofacial nucleus and in the lateral reticular nucleus, whereas a low density of such perikarya was found in the interpeduncular nucleus, nucleus incertus, nucleus sagulum, gigantocellular tegmental field, nucleus of the trapezoid body, nucleus praepositus hypoglosii, lateral and magnocellular tegmental fields, nucleus of the solitary tract, nucleus ambiguous and in the nucleus intercalatus. Moreover, a moderate density of somatostatin-28 (1-12)-immunoreactive processes was found in the dorsal nucleus of the raphe, dorsal tegmental nucleus, accessory dorsal tegmental nucleus, periaqueductal gray and in the marginal nucleus of the brachium conjunctivum. Finally, few immunoreactive fibers were visualized in the interpeduncular nucleus, cuneiform nucleus, locus coeruleus, nucleus incertus, superior and inferior central nuclei, nucleus sagulum, ventral nucleus of the lateral lemniscus, nucleus praepositus hypoglosii, medial vestibular nucleus, Kölliker-Fuse area, nucleus ambiguous, retrofacial nucleus, postpyramidal nucleus of the raphe, nucleus of the solitary tract, dorsal motor nucleus of the vagus, lateral reticular nucleus and laminar and alaminar spinal trigeminal nuclei.',\n 'PMID': 1346714},\n {'_id': ObjectId('6617b085fb3d2cdc7cf495f9'),\n 'id': 'pubmed23n0133_7446',\n 'title': 'Afferent and efferent connections of the medial preoptic area in the rat: a WGA-HRP study.',\n 'content': 'Afferent and efferent connections of the medial preoptic area including medial preoptic nucleus (MP) and periventricular area at the MP level were examined using WGA-HRP as a marker. Injections were performed by insertion of micropipette containing (1) small amount of HRP powder or (2) dryed HRP solution for 24 to 48 hr until the fixation or for 5 min respectively. 
Dorsal and ventral approaches of injection micropipettes were performed and the results were compared. Previously reported reciprocal connections with lateral septum, bed nucleus of the stria terminalis, medial amygdaloid nucleus, lateral hypothalamic nucleus, paraventricular hypothalamic nucleus, ventromedial hypothalamic nucleus, arcuate nucleus, supramammillary nucleus, central gray at the mesencephalon, raphe dorsalis, raphe medianus, and lateral parabrachial nucleus have been confirmed. In addition, we found reciprocal connections with septo-hypothalamic nucleus, amygdalo-hipocampal nucleus, subiculum, parafascicular thalamic nucleus, posterior thalamic nucleus at the caudo-ventral subdivision, median preoptic nucleus, lateral preoptic nucleus, anterior hypothalamic nucleus, periventricular area at the caudal hypothalamic level, dorsomedial hypothalamic nucleus, posterior hypothalamic nucleus, dorsal and ventral premammillary nucleus, lateral mammillary nucleus, peripeduncular nucleus, periventricular gray, ventral tegmental area, interpeduncular nucleus, nucleus raphe pontis, nucleus raphe magnus, pedunculo-pontine tegmental nucleus, gigantocellular reticular nucleus and solitary tract nucleus. The areas which had only efferent connections from MP were accumbens, caudate putamen, ventral pallidum, substantia innominata, lateral habenular nucleus, paratenial thalamic nucleus, paraventricular thalamic nucleus, mediodorsal thalamic nucleus, reuniens thalamic nucleus, median eminence, medial mammillary nucleus, subthalamic nucleus, pars compacta of substantia nigra, oculomotor nucleus, red nucleus, laterodorsal tegmental nucleus, reticular tegmental nucleus, cuneiform nucleus, nucleus locus coeruleus, and dorsal motor nucleus of vagus among which substantia innominata and median eminence were previously reported. Efferent connections to the nucleus of Darkschewitsch, interstitial nucleus of Cajal, dorsal tegmental nucleus, ventral tegmental nucleus, vestibular nuclei, nucleus raphe obsculus were very weak or abscent in the ventral approach while they were observed in dorsal approach. Previously reported afferent connections from dorsal tegmental nucleus, cuneiform nucleus, and nucleus locus ceruleus were not detected in this study.(ABSTRACT TRUNCATED AT 400 WORDS)',\n 'contents': 'Afferent and efferent connections of the medial preoptic area in the rat: a WGA-HRP study. Afferent and efferent connections of the medial preoptic area including medial preoptic nucleus (MP) and periventricular area at the MP level were examined using WGA-HRP as a marker. Injections were performed by insertion of micropipette containing (1) small amount of HRP powder or (2) dryed HRP solution for 24 to 48 hr until the fixation or for 5 min respectively. Dorsal and ventral approaches of injection micropipettes were performed and the results were compared. Previously reported reciprocal connections with lateral septum, bed nucleus of the stria terminalis, medial amygdaloid nucleus, lateral hypothalamic nucleus, paraventricular hypothalamic nucleus, ventromedial hypothalamic nucleus, arcuate nucleus, supramammillary nucleus, central gray at the mesencephalon, raphe dorsalis, raphe medianus, and lateral parabrachial nucleus have been confirmed. 
In addition, we found reciprocal connections with septo-hypothalamic nucleus, amygdalo-hipocampal nucleus, subiculum, parafascicular thalamic nucleus, posterior thalamic nucleus at the caudo-ventral subdivision, median preoptic nucleus, lateral preoptic nucleus, anterior hypothalamic nucleus, periventricular area at the caudal hypothalamic level, dorsomedial hypothalamic nucleus, posterior hypothalamic nucleus, dorsal and ventral premammillary nucleus, lateral mammillary nucleus, peripeduncular nucleus, periventricular gray, ventral tegmental area, interpeduncular nucleus, nucleus raphe pontis, nucleus raphe magnus, pedunculo-pontine tegmental nucleus, gigantocellular reticular nucleus and solitary tract nucleus. The areas which had only efferent connections from MP were accumbens, caudate putamen, ventral pallidum, substantia innominata, lateral habenular nucleus, paratenial thalamic nucleus, paraventricular thalamic nucleus, mediodorsal thalamic nucleus, reuniens thalamic nucleus, median eminence, medial mammillary nucleus, subthalamic nucleus, pars compacta of substantia nigra, oculomotor nucleus, red nucleus, laterodorsal tegmental nucleus, reticular tegmental nucleus, cuneiform nucleus, nucleus locus coeruleus, and dorsal motor nucleus of vagus among which substantia innominata and median eminence were previously reported. Efferent connections to the nucleus of Darkschewitsch, interstitial nucleus of Cajal, dorsal tegmental nucleus, ventral tegmental nucleus, vestibular nuclei, nucleus raphe obsculus were very weak or abscent in the ventral approach while they were observed in dorsal approach. Previously reported afferent connections from dorsal tegmental nucleus, cuneiform nucleus, and nucleus locus ceruleus were not detected in this study.(ABSTRACT TRUNCATED AT 400 WORDS)',\n 'PMID': 3995367},\n {'_id': ObjectId('6617b083fb3d2cdc7cf43ab3'),\n 'id': 'pubmed23n0132_4049',\n 'title': '[Afferent connections of the nucleus of the facial nerve in the cat detected using the technic of retrograde axonal transport of horseradish peroxidase].',\n 'content': 'Neuronal populations in the brainstem and spinal cord as sources of fibre pathways to the facial nucleus were studied in adult cats by means of microionophoretic injections of horseradish peroxidase into restricted zones of the facial nucleus. Projection from nucleus nervi hypoglossi, nucleus praepositus hypoglossi, nucleus raphe pallidus, nucleus intercalatus, medial nucleus of the solitary tract, dorsal motor nucleus of the vagus, neurons of genu of the facial nerve, ipsilateral red nucleus and reticular formation of the midbrain to the facial nucleus are found. Projections from a number of other brain structures to the facial nucleus are confirmed. A topographical map of distribution of the brainstem and spinal cord afferents in the facial nucleus is proposed.',\n 'contents': '[Afferent connections of the nucleus of the facial nerve in the cat detected using the technic of retrograde axonal transport of horseradish peroxidase]. Neuronal populations in the brainstem and spinal cord as sources of fibre pathways to the facial nucleus were studied in adult cats by means of microionophoretic injections of horseradish peroxidase into restricted zones of the facial nucleus. 
Projection from nucleus nervi hypoglossi, nucleus praepositus hypoglossi, nucleus raphe pallidus, nucleus intercalatus, medial nucleus of the solitary tract, dorsal motor nucleus of the vagus, neurons of genu of the facial nerve, ipsilateral red nucleus and reticular formation of the midbrain to the facial nucleus are found. Projections from a number of other brain structures to the facial nucleus are confirmed. A topographical map of distribution of the brainstem and spinal cord afferents in the facial nucleus is proposed.',\n 'PMID': 3960201},\n {'_id': ObjectId('6617b0bcfb3d2cdc7cfebd34'),\n 'id': 'pubmed23n0237_9745',\n 'title': 'Brainstem afferents to the thalamus in a lizard, Varanus exanthematicus.',\n 'content': 'HRP was injected into various thalamic nuclei in order to investigate the brainstem projections to the thalamus in the lizard Varanus exanthematicus. Nucleus dorsomedialis receives afferents from the septal area, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, area triangularis, nucleus raphes superior, nucleus reticularis inferior, and locus coeruleus. Nucleus dorsolateralis receives afferents from septal area, nucleus dorsomedialis, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, and the torus semicircularis. Nucleus rotundus receives an input from the tectum mesencephali, the pretectal area, and from the mesencephalic reticular formation. Nucleus intermedius dorsalis receives afferents from the dorsal column nuclei and nucleus periventricularis hypothalami. Nucleus ventrolateralis receives afferents from the dorsal column nuclei, the trigeminal complex, locus coeruleus, and the reticular formation. Nucleus ventromedialis also receives afferents from the trigeminal complex and the reticular formation. Afferents to the habenula have been demonstrated from the septal area, nucleus entopeduncularis anterior, triangular area, nucleus periventricularis hypothalami, nucleus interpeduncularis, nucleus raphes superior, locus coeruleus, nucleus isthmi, nucleus dorsalis motorius nervi vagi, and the mesencephalic tegmentum. The laminar part of the torus semicicularis projects to nucleus medialis.',\n 'contents': 'Brainstem afferents to the thalamus in a lizard, Varanus exanthematicus. HRP was injected into various thalamic nuclei in order to investigate the brainstem projections to the thalamus in the lizard Varanus exanthematicus. Nucleus dorsomedialis receives afferents from the septal area, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, area triangularis, nucleus raphes superior, nucleus reticularis inferior, and locus coeruleus. Nucleus dorsolateralis receives afferents from septal area, nucleus dorsomedialis, nucleus entopeduncularis anterior, nucleus periventricularis hypothalami, and the torus semicircularis. Nucleus rotundus receives an input from the tectum mesencephali, the pretectal area, and from the mesencephalic reticular formation. Nucleus intermedius dorsalis receives afferents from the dorsal column nuclei and nucleus periventricularis hypothalami. Nucleus ventrolateralis receives afferents from the dorsal column nuclei, the trigeminal complex, locus coeruleus, and the reticular formation. Nucleus ventromedialis also receives afferents from the trigeminal complex and the reticular formation. 
Afferents to the habenula have been demonstrated from the septal area, nucleus entopeduncularis anterior, triangular area, nucleus periventricularis hypothalami, nucleus interpeduncularis, nucleus raphes superior, locus coeruleus, nucleus isthmi, nucleus dorsalis motorius nervi vagi, and the mesencephalic tegmentum. The laminar part of the torus semicicularis projects to nucleus medialis.',\n 'PMID': 7130476},\n {'_id': ObjectId('6617b007fb3d2cdc7cde3763'),\n 'id': 'pubmed23n0058_6194',\n 'title': 'Comparative cytoarchitectonic analysis of some visual pretectal nuclei in teleosts.',\n 'content': 'The posterior pretectal nucleus, which in Osteoglossum receives second order visual input and projects to the inferior lobe of the hypothalamus, was identified and characterized in species from all major groups of non-neoteleost teleosts. The hypothesis that the posterior pretectal nucleus in these species is homologous to both the pars intermedius of the superficial pretectal nucleus and nucleus glomerulosus in acanthopterygians is supported by multiple similarities in relative position and cytoarchitecture. Nucleus corticalis, which receives retinal input and projects to the posterior pretectal nucleus (or to nucleus glomerulosus), was identified in species belonging to three of the four major teleost radiations. Both the posterior pretectal nucleus and nucleus corticalis are plesiomorphic for teleosts. The presence of glomeruli in the posterior pretectal nucleus and nucleus glomerulosus in esocids and acanthopterygians, respectively, and the presence of two nuclei, the pars intermedius and nucleus glomerulosus, in acanthopterygians, as opposed to one nucleus, the posterior pretectal nucleus, are apomorphies.',\n 'contents': 'Comparative cytoarchitectonic analysis of some visual pretectal nuclei in teleosts. The posterior pretectal nucleus, which in Osteoglossum receives second order visual input and projects to the inferior lobe of the hypothalamus, was identified and characterized in species from all major groups of non-neoteleost teleosts. The hypothesis that the posterior pretectal nucleus in these species is homologous to both the pars intermedius of the superficial pretectal nucleus and nucleus glomerulosus in acanthopterygians is supported by multiple similarities in relative position and cytoarchitecture. Nucleus corticalis, which receives retinal input and projects to the posterior pretectal nucleus (or to nucleus glomerulosus), was identified in species belonging to three of the four major teleost radiations. Both the posterior pretectal nucleus and nucleus corticalis are plesiomorphic for teleosts. 
The presence of glomeruli in the posterior pretectal nucleus and nucleus glomerulosus in esocids and acanthopterygians, respectively, and the presence of two nuclei, the pars intermedius and nucleus glomerulosus, in acanthopterygians, as opposed to one nucleus, the posterior pretectal nucleus, are apomorphies.',\n 'PMID': 1742601}]" 106 | }, 107 | "execution_count": 14, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "pmid_liste = search(\"Is it possible to visualize subtahalamic nucleus by using transcranial ultrasound?\")\n", 114 | "\n", 115 | "list(pmid_liste)[:5]" 116 | ], 117 | "metadata": { 118 | "collapsed": false, 119 | "ExecuteTime": { 120 | "end_time": "2024-04-11T14:48:21.797401Z", 121 | "start_time": "2024-04-11T14:48:21.771026Z" 122 | } 123 | }, 124 | "id": "885b4cd3f85a48b4", 125 | "execution_count": 14 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "source": [ 130 | "This funktion retrieves the PMIDs of the documents that contain the search query in the content field. The results are sorted by the TF-IDF ranking.\n", 131 | "\n", 132 | "It takes significantly longer to retrieve the results because the documents are sorted by the TF-IDF ranking." 133 | ], 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "id": "998b429c804633be" 138 | }, 139 | { 140 | "cell_type": "code", 141 | "outputs": [], 142 | "source": [ 143 | "def search_TF_IDF(query, k):\n", 144 | " results = collection.find({\"$text\": {\"$search\": query}}, {\"_id\": 0, \"PMID\": 1, \"score\": {\"$meta\": \"textScore\"}}).sort([(\"score\", {\"$meta\": \"textScore\"})]).limit(k)\n", 145 | " return results" 146 | ], 147 | "metadata": { 148 | "collapsed": false, 149 | "ExecuteTime": { 150 | "end_time": "2024-04-11T14:33:28.996003Z", 151 | "start_time": "2024-04-11T14:33:28.989219Z" 152 | } 153 | }, 154 | "id": "2398467d2a980e74", 155 | "execution_count": 2 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "source": [ 160 | "MongoDB uses lazy evaluation. Thus, the query is not executed until the results are accessed. We access the results to measure the time it takes to retrieve the results." 161 | ], 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "id": "3a316c740cda85d" 166 | }, 167 | { 168 | "cell_type": "code", 169 | "outputs": [], 170 | "source": [ 171 | "pmid_liste = search_TF_IDF(\"Is it possible to visualize subtahalamic nucleus by using transcranial ultrasound?\", 10)" 172 | ], 173 | "metadata": { 174 | "collapsed": false, 175 | "ExecuteTime": { 176 | "end_time": "2024-04-11T14:33:31.255852Z", 177 | "start_time": "2024-04-11T14:33:31.249673Z" 178 | } 179 | }, 180 | "id": "52bfc4116b88b82d", 181 | "execution_count": 3 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "source": [ 186 | "The results are retrieved as a cursor. We convert the cursor to a list to access the results. This takes a while because the results are sorted by the TF-IDF ranking." 
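The timing mentioned here can be measured explicitly. A minimal sketch, assuming the search_TF_IDF function and the MongoDB connection defined earlier in this notebook (the query string is only illustrative): materializing the cursor with list() is what actually executes the sorted query, so that is the step to time.

import time

query = "transcranial ultrasound visualization of the subthalamic nucleus"

start = time.perf_counter()
docs = list(search_TF_IDF(query, 10))   # forces execution of the $text query and the textScore sort
elapsed = time.perf_counter() - start

print(f"Retrieved {len(docs)} documents in {elapsed:.1f} s")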
187 | ], 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "id": "6a462e683a59117" 192 | }, 193 | { 194 | "cell_type": "code", 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": "[{'PMID': 1627439, 'score': 3.31173513986014},\n {'PMID': 64478, 'score': 3.2818834459459465},\n {'PMID': 1705058, 'score': 3.2310779816513757},\n {'PMID': 6763082, 'score': 3.229017857142857},\n {'PMID': 2683312, 'score': 3.1279296875},\n {'PMID': 1650433, 'score': 3.095602766798419},\n {'PMID': 2473416, 'score': 3.095472440944882},\n {'PMID': 3473897, 'score': 3.07967032967033},\n {'PMID': 1519071, 'score': 3.0697115384615383},\n {'PMID': 3545257, 'score': 3.0483333333333333}]" 199 | }, 200 | "execution_count": 4, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "pmid_liste = list(pmid_liste)\n", 207 | "pmid_liste" 208 | ], 209 | "metadata": { 210 | "collapsed": false, 211 | "ExecuteTime": { 212 | "end_time": "2024-04-11T14:33:59.618631Z", 213 | "start_time": "2024-04-11T14:33:33.265473Z" 214 | } 215 | }, 216 | "id": "c84b73f819247fb5", 217 | "execution_count": 4 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "source": [ 222 | "25 seconds are needed to retrieve the results for the query. This time is impractical for a real-time search engine." 223 | ], 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "id": "74e53b14defc9b03" 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 2 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython2", 246 | "version": "2.7.6" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 5 251 | } 252 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/elasticsearch/elastic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "outputs": [], 6 | "source": [ 7 | "import json\n", 8 | "from elasticsearch import Elasticsearch, helpers\n", 9 | "import urllib3\n", 10 | "import os\n", 11 | "\n", 12 | "\n", 13 | "elastic_password = os.getenv('ELASTIC_PASSWORD_SERVER')\n", 14 | "\n", 15 | "es = Elasticsearch(\n", 16 | " ['https://localhost:9200'],\n", 17 | " basic_auth=('elastic', elastic_password),\n", 18 | " verify_certs=False,\n", 19 | " ca_certs=\"C:/Users/linus/http_ca.crt\"\n", 20 | ")\n", 21 | "\n", 22 | "urllib3.disable_warnings()" 23 | ], 24 | "metadata": { 25 | "collapsed": false, 26 | "ExecuteTime": { 27 | "end_time": "2024-04-06T20:56:12.028476Z", 28 | "start_time": "2024-04-06T20:56:11.995320Z" 29 | } 30 | }, 31 | "id": "c730151aee91dee3", 32 | "execution_count": 7 33 | }, 34 | { 35 | "cell_type": "code", 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": "ObjectApiResponse({'name': 'b3472380ffa2', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'DQvmIapdSNS30vfmGkeR8w', 'version': {'number': '8.13.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '9287f29bba5e270bd51d557b8daccb7d118ba247', 'build_date': '2024-03-29T10:05:29.787251984Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 
'You Know, for Search'})" 40 | }, 41 | "execution_count": 8, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "# Test the connection\n", 48 | "es.info()" 49 | ], 50 | "metadata": { 51 | "collapsed": false, 52 | "ExecuteTime": { 53 | "end_time": "2024-04-06T20:56:15.903778Z", 54 | "start_time": "2024-04-06T20:56:15.741059Z" 55 | } 56 | }, 57 | "id": "9900f0eb7bdc320", 58 | "execution_count": 8 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "### Indexing Documents with Embeddings into Elasticsearch for Vector Similarity Search" 64 | ], 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "id": "a2769980d775a70d" 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "initialize the index with the appropriate mapping for the dense vector field." 74 | ], 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "id": "d35e8d0e15557047" 79 | }, 80 | { 81 | "cell_type": "code", 82 | "outputs": [], 83 | "source": [ 84 | "# drop the index if it already exists\n", 85 | "if es.indices.exists(index='pubmed_emb_index'):\n", 86 | " es.indices.delete(index='pubmed_emb_index')" 87 | ], 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "id": "a4809b06942eaf91", 92 | "execution_count": 14 93 | }, 94 | { 95 | "cell_type": "code", 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "C:\\Users\\linus\\anaconda3\\Lib\\site-packages\\urllib3\\connectionpool.py:1056: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", 102 | " warnings.warn(\n", 103 | "C:\\Users\\linus\\anaconda3\\Lib\\site-packages\\urllib3\\connectionpool.py:1056: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", 104 | " warnings.warn(\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Define the index name\n", 110 | "index_name = \"pubmed_emb_index\"\n", 111 | "\n", 112 | "# Check if the index already exists\n", 113 | "if not es.indices.exists(index=index_name):\n", 114 | " # Define the mapping\n", 115 | " mapping = {\n", 116 | " \"mappings\": {\n", 117 | " \"properties\": {\n", 118 | " \"embeddings\": {\"type\": \"dense_vector\", \"dims\": 768} # Adjust the dimension size as needed\n", 119 | " # Add other field mappings as necessary\n", 120 | " }\n", 121 | " }\n", 122 | " }\n", 123 | " \n", 124 | " # Create the index with the defined mapping\n", 125 | " es.indices.create(index=index_name, body=mapping)" 126 | ], 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "id": "e950a01927be6bdd", 131 | "execution_count": 7 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "source": [ 136 | "load the JSONL files containing the PubMed documents, extract the embeddings, and index the documents into Elasticsearch." 
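One practical note before the ingestion cell below: the bulk run aborts partway through with a ConnectionTimeout. A hedged sketch of a more defensive setup, assuming the same cluster, the elastic_password variable from the first cell, and a placeholder JSONL file name; the timeout and retry values are illustrative rather than tuned, they only show where such settings live on the client:

from elasticsearch import Elasticsearch, helpers
import json

# Assumptions: same cluster and credentials as above; 'chunk_000.jsonl' is a placeholder file name.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=False,
    request_timeout=120,     # give slow bulk requests more time than the client default
    retry_on_timeout=True,   # retry a timed-out request instead of failing the whole run
    max_retries=3,
)

def actions_from_jsonl(path, index_name):
    # Stream one bulk action per JSONL line instead of holding large batches in memory.
    with open(path, 'r') as fh:
        for line in fh:
            yield {"_index": index_name, "_source": json.loads(line)}

helpers.bulk(es, actions_from_jsonl('chunk_000.jsonl', 'pubmed_emb_index'), chunk_size=500)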
137 | ], 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "id": "d299505c678d88c3" 142 | }, 143 | { 144 | "cell_type": "code", 145 | "outputs": [ 146 | { 147 | "name": "stderr", 148 | "output_type": "stream", 149 | "text": [ 150 | " 63%|██████▎ | 63/100 [35:47<51:34, 83.63s/it]" 151 | ] 152 | }, 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "An error occurred: Connection timed out\n" 158 | ] 159 | }, 160 | { 161 | "name": "stderr", 162 | "output_type": "stream", 163 | "text": [ 164 | "100%|██████████| 100/100 [48:28<00:00, 29.09s/it]\n" 165 | ] 166 | }, 167 | { 168 | "ename": "ConnectionTimeout", 169 | "evalue": "Connection timed out", 170 | "output_type": "error", 171 | "traceback": [ 172 | "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", 173 | "\u001B[1;31mConnectionTimeout\u001B[0m Traceback (most recent call last)", 174 | "Cell \u001B[1;32mIn[15], line 49\u001B[0m\n\u001B[0;32m 46\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mIndexing complete\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[0;32m 48\u001B[0m \u001B[38;5;66;03m# Rufen Sie die Funktion auf, um die Dokumente zu indizieren\u001B[39;00m\n\u001B[1;32m---> 49\u001B[0m bulk_index_documents(source_directory, index_name)\n", 175 | "Cell \u001B[1;32mIn[15], line 44\u001B[0m, in \u001B[0;36mbulk_index_documents\u001B[1;34m(source_directory, index_name)\u001B[0m\n\u001B[0;32m 42\u001B[0m \u001B[38;5;66;03m# Indexieren Sie alle verbleibenden Dokumente\u001B[39;00m\n\u001B[0;32m 43\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m actions:\n\u001B[1;32m---> 44\u001B[0m helpers\u001B[38;5;241m.\u001B[39mbulk(es, actions)\n\u001B[0;32m 46\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mIndexing complete\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", 176 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\helpers\\actions.py:521\u001B[0m, in \u001B[0;36mbulk\u001B[1;34m(client, actions, stats_only, ignore_status, *args, **kwargs)\u001B[0m\n\u001B[0;32m 519\u001B[0m \u001B[38;5;66;03m# make streaming_bulk yield successful results so we can count them\u001B[39;00m\n\u001B[0;32m 520\u001B[0m kwargs[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124myield_ok\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n\u001B[1;32m--> 521\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m ok, item \u001B[38;5;129;01min\u001B[39;00m streaming_bulk(\n\u001B[0;32m 522\u001B[0m client, actions, ignore_status\u001B[38;5;241m=\u001B[39mignore_status, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[0;32m 523\u001B[0m ):\n\u001B[0;32m 524\u001B[0m \u001B[38;5;66;03m# go through request-response pairs and detect failures\u001B[39;00m\n\u001B[0;32m 525\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m ok:\n\u001B[0;32m 526\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m stats_only:\n", 177 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\helpers\\actions.py:436\u001B[0m, in \u001B[0;36mstreaming_bulk\u001B[1;34m(client, actions, chunk_size, max_chunk_bytes, raise_on_error, expand_action_callback, raise_on_exception, max_retries, initial_backoff, max_backoff, yield_ok, ignore_status, *args, **kwargs)\u001B[0m\n\u001B[0;32m 433\u001B[0m 
time\u001B[38;5;241m.\u001B[39msleep(\u001B[38;5;28mmin\u001B[39m(max_backoff, initial_backoff \u001B[38;5;241m*\u001B[39m \u001B[38;5;241m2\u001B[39m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m (attempt \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m)))\n\u001B[0;32m 435\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 436\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m data, (ok, info) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mzip\u001B[39m(\n\u001B[0;32m 437\u001B[0m bulk_data,\n\u001B[0;32m 438\u001B[0m _process_bulk_chunk(\n\u001B[0;32m 439\u001B[0m client,\n\u001B[0;32m 440\u001B[0m bulk_actions,\n\u001B[0;32m 441\u001B[0m bulk_data,\n\u001B[0;32m 442\u001B[0m raise_on_exception,\n\u001B[0;32m 443\u001B[0m raise_on_error,\n\u001B[0;32m 444\u001B[0m ignore_status,\n\u001B[0;32m 445\u001B[0m \u001B[38;5;241m*\u001B[39margs,\n\u001B[0;32m 446\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs,\n\u001B[0;32m 447\u001B[0m ),\n\u001B[0;32m 448\u001B[0m ):\n\u001B[0;32m 449\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m ok:\n\u001B[0;32m 450\u001B[0m action, info \u001B[38;5;241m=\u001B[39m info\u001B[38;5;241m.\u001B[39mpopitem()\n", 178 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\helpers\\actions.py:339\u001B[0m, in \u001B[0;36m_process_bulk_chunk\u001B[1;34m(client, bulk_actions, bulk_data, raise_on_exception, raise_on_error, ignore_status, *args, **kwargs)\u001B[0m\n\u001B[0;32m 335\u001B[0m ignore_status \u001B[38;5;241m=\u001B[39m (ignore_status,)\n\u001B[0;32m 337\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 338\u001B[0m \u001B[38;5;66;03m# send the actual request\u001B[39;00m\n\u001B[1;32m--> 339\u001B[0m resp \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mbulk(\u001B[38;5;241m*\u001B[39margs, operations\u001B[38;5;241m=\u001B[39mbulk_actions, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs) \u001B[38;5;66;03m# type: ignore[arg-type]\u001B[39;00m\n\u001B[0;32m 340\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m ApiError \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[0;32m 341\u001B[0m gen \u001B[38;5;241m=\u001B[39m _process_bulk_chunk_error(\n\u001B[0;32m 342\u001B[0m error\u001B[38;5;241m=\u001B[39me,\n\u001B[0;32m 343\u001B[0m bulk_data\u001B[38;5;241m=\u001B[39mbulk_data,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 346\u001B[0m raise_on_error\u001B[38;5;241m=\u001B[39mraise_on_error,\n\u001B[0;32m 347\u001B[0m )\n", 179 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\utils.py:446\u001B[0m, in \u001B[0;36m_rewrite_parameters..wrapper..wrapped\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 443\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n\u001B[0;32m 444\u001B[0m \u001B[38;5;28;01mpass\u001B[39;00m\n\u001B[1;32m--> 446\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m api(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", 180 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\__init__.py:714\u001B[0m, in \u001B[0;36mElasticsearch.bulk\u001B[1;34m(self, operations, body, index, error_trace, filter_path, human, pipeline, pretty, refresh, require_alias, routing, source, source_excludes, source_includes, timeout, wait_for_active_shards)\u001B[0m\n\u001B[0;32m 709\u001B[0m __body \u001B[38;5;241m=\u001B[39m operations \u001B[38;5;28;01mif\u001B[39;00m operations 
\u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;28;01melse\u001B[39;00m body\n\u001B[0;32m 710\u001B[0m __headers \u001B[38;5;241m=\u001B[39m {\n\u001B[0;32m 711\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124maccept\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mapplication/json\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 712\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcontent-type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mapplication/x-ndjson\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 713\u001B[0m }\n\u001B[1;32m--> 714\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mperform_request( \u001B[38;5;66;03m# type: ignore[return-value]\u001B[39;00m\n\u001B[0;32m 715\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mPUT\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 716\u001B[0m __path,\n\u001B[0;32m 717\u001B[0m params\u001B[38;5;241m=\u001B[39m__query,\n\u001B[0;32m 718\u001B[0m headers\u001B[38;5;241m=\u001B[39m__headers,\n\u001B[0;32m 719\u001B[0m body\u001B[38;5;241m=\u001B[39m__body,\n\u001B[0;32m 720\u001B[0m endpoint_id\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mbulk\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 721\u001B[0m path_parts\u001B[38;5;241m=\u001B[39m__path_parts,\n\u001B[0;32m 722\u001B[0m )\n", 181 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\_base.py:271\u001B[0m, in \u001B[0;36mBaseClient.perform_request\u001B[1;34m(self, method, path, params, headers, body, endpoint_id, path_parts)\u001B[0m\n\u001B[0;32m 255\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mperform_request\u001B[39m(\n\u001B[0;32m 256\u001B[0m \u001B[38;5;28mself\u001B[39m,\n\u001B[0;32m 257\u001B[0m method: \u001B[38;5;28mstr\u001B[39m,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 264\u001B[0m path_parts: Optional[Mapping[\u001B[38;5;28mstr\u001B[39m, Any]] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[0;32m 265\u001B[0m ) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m ApiResponse[Any]:\n\u001B[0;32m 266\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_otel\u001B[38;5;241m.\u001B[39mspan(\n\u001B[0;32m 267\u001B[0m method,\n\u001B[0;32m 268\u001B[0m endpoint_id\u001B[38;5;241m=\u001B[39mendpoint_id,\n\u001B[0;32m 269\u001B[0m path_parts\u001B[38;5;241m=\u001B[39mpath_parts \u001B[38;5;129;01mor\u001B[39;00m {},\n\u001B[0;32m 270\u001B[0m ) \u001B[38;5;28;01mas\u001B[39;00m otel_span:\n\u001B[1;32m--> 271\u001B[0m response \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_perform_request(\n\u001B[0;32m 272\u001B[0m method,\n\u001B[0;32m 273\u001B[0m path,\n\u001B[0;32m 274\u001B[0m params\u001B[38;5;241m=\u001B[39mparams,\n\u001B[0;32m 275\u001B[0m headers\u001B[38;5;241m=\u001B[39mheaders,\n\u001B[0;32m 276\u001B[0m body\u001B[38;5;241m=\u001B[39mbody,\n\u001B[0;32m 277\u001B[0m otel_span\u001B[38;5;241m=\u001B[39motel_span,\n\u001B[0;32m 278\u001B[0m )\n\u001B[0;32m 279\u001B[0m otel_span\u001B[38;5;241m.\u001B[39mset_elastic_cloud_metadata(response\u001B[38;5;241m.\u001B[39mmeta\u001B[38;5;241m.\u001B[39mheaders)\n\u001B[0;32m 280\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m response\n", 182 | "File 
\u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elasticsearch\\_sync\\client\\_base.py:316\u001B[0m, in \u001B[0;36mBaseClient._perform_request\u001B[1;34m(self, method, path, params, headers, body, otel_span)\u001B[0m\n\u001B[0;32m 313\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 314\u001B[0m target \u001B[38;5;241m=\u001B[39m path\n\u001B[1;32m--> 316\u001B[0m meta, resp_body \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtransport\u001B[38;5;241m.\u001B[39mperform_request(\n\u001B[0;32m 317\u001B[0m method,\n\u001B[0;32m 318\u001B[0m target,\n\u001B[0;32m 319\u001B[0m headers\u001B[38;5;241m=\u001B[39mrequest_headers,\n\u001B[0;32m 320\u001B[0m body\u001B[38;5;241m=\u001B[39mbody,\n\u001B[0;32m 321\u001B[0m request_timeout\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_request_timeout,\n\u001B[0;32m 322\u001B[0m max_retries\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_max_retries,\n\u001B[0;32m 323\u001B[0m retry_on_status\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_retry_on_status,\n\u001B[0;32m 324\u001B[0m retry_on_timeout\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_retry_on_timeout,\n\u001B[0;32m 325\u001B[0m client_meta\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_client_meta,\n\u001B[0;32m 326\u001B[0m otel_span\u001B[38;5;241m=\u001B[39motel_span,\n\u001B[0;32m 327\u001B[0m )\n\u001B[0;32m 329\u001B[0m \u001B[38;5;66;03m# HEAD with a 404 is returned as a normal response\u001B[39;00m\n\u001B[0;32m 330\u001B[0m \u001B[38;5;66;03m# since this is used as an 'exists' functionality.\u001B[39;00m\n\u001B[0;32m 331\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (method \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mHEAD\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m meta\u001B[38;5;241m.\u001B[39mstatus \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m404\u001B[39m) \u001B[38;5;129;01mand\u001B[39;00m (\n\u001B[0;32m 332\u001B[0m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;241m200\u001B[39m \u001B[38;5;241m<\u001B[39m\u001B[38;5;241m=\u001B[39m meta\u001B[38;5;241m.\u001B[39mstatus \u001B[38;5;241m<\u001B[39m \u001B[38;5;241m299\u001B[39m\n\u001B[0;32m 333\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m (\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 337\u001B[0m )\n\u001B[0;32m 338\u001B[0m ):\n", 183 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elastic_transport\\_transport.py:342\u001B[0m, in \u001B[0;36mTransport.perform_request\u001B[1;34m(self, method, target, body, headers, max_retries, retry_on_status, retry_on_timeout, request_timeout, client_meta, otel_span)\u001B[0m\n\u001B[0;32m 340\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 341\u001B[0m otel_span\u001B[38;5;241m.\u001B[39mset_node_metadata(node\u001B[38;5;241m.\u001B[39mhost, node\u001B[38;5;241m.\u001B[39mport, node\u001B[38;5;241m.\u001B[39mbase_url, target)\n\u001B[1;32m--> 342\u001B[0m resp \u001B[38;5;241m=\u001B[39m node\u001B[38;5;241m.\u001B[39mperform_request(\n\u001B[0;32m 343\u001B[0m method,\n\u001B[0;32m 344\u001B[0m target,\n\u001B[0;32m 345\u001B[0m body\u001B[38;5;241m=\u001B[39mrequest_body,\n\u001B[0;32m 346\u001B[0m headers\u001B[38;5;241m=\u001B[39mrequest_headers,\n\u001B[0;32m 347\u001B[0m request_timeout\u001B[38;5;241m=\u001B[39mrequest_timeout,\n\u001B[0;32m 
348\u001B[0m )\n\u001B[0;32m 349\u001B[0m _logger\u001B[38;5;241m.\u001B[39minfo(\n\u001B[0;32m 350\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m [status:\u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m duration:\u001B[39m\u001B[38;5;132;01m%.3f\u001B[39;00m\u001B[38;5;124ms]\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 351\u001B[0m \u001B[38;5;241m%\u001B[39m (\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 357\u001B[0m )\n\u001B[0;32m 358\u001B[0m )\n\u001B[0;32m 360\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m method \u001B[38;5;241m!=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mHEAD\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n", 184 | "File \u001B[1;32m~\\anaconda3\\Lib\\site-packages\\elastic_transport\\_node\\_http_urllib3.py:202\u001B[0m, in \u001B[0;36mUrllib3HttpNode.perform_request\u001B[1;34m(self, method, target, body, headers, request_timeout)\u001B[0m\n\u001B[0;32m 194\u001B[0m err \u001B[38;5;241m=\u001B[39m \u001B[38;5;167;01mConnectionError\u001B[39;00m(\u001B[38;5;28mstr\u001B[39m(e), errors\u001B[38;5;241m=\u001B[39m(e,))\n\u001B[0;32m 195\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_log_request(\n\u001B[0;32m 196\u001B[0m method\u001B[38;5;241m=\u001B[39mmethod,\n\u001B[0;32m 197\u001B[0m target\u001B[38;5;241m=\u001B[39mtarget,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 200\u001B[0m exception\u001B[38;5;241m=\u001B[39merr,\n\u001B[0;32m 201\u001B[0m )\n\u001B[1;32m--> 202\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m err \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m 204\u001B[0m meta \u001B[38;5;241m=\u001B[39m ApiResponseMeta(\n\u001B[0;32m 205\u001B[0m node\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mconfig,\n\u001B[0;32m 206\u001B[0m duration\u001B[38;5;241m=\u001B[39mduration,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 209\u001B[0m headers\u001B[38;5;241m=\u001B[39mresponse_headers,\n\u001B[0;32m 210\u001B[0m )\n\u001B[0;32m 211\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_log_request(\n\u001B[0;32m 212\u001B[0m method\u001B[38;5;241m=\u001B[39mmethod,\n\u001B[0;32m 213\u001B[0m target\u001B[38;5;241m=\u001B[39mtarget,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 217\u001B[0m response\u001B[38;5;241m=\u001B[39mdata,\n\u001B[0;32m 218\u001B[0m )\n", 185 | "\u001B[1;31mConnectionTimeout\u001B[0m: Connection timed out" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "from pathlib import Path\n", 191 | "import os\n", 192 | "import json\n", 193 | "from tqdm import tqdm\n", 194 | "\n", 195 | "source_directory = Path('C:/Users/linus/big_data/pubmed/first100JSONLembedded/')\n", 196 | "\n", 197 | "index_name = \"pubmed_emb_index\"\n", 198 | "\n", 199 | "def bulk_index_documents(source_directory, index_name):\n", 200 | " if not source_directory.exists():\n", 201 | " print(\"The source directory does not exist.\")\n", 202 | " return\n", 203 | "\n", 204 | " actions = [] \n", 205 | "\n", 206 | " for file_name in tqdm(os.listdir(source_directory)):\n", 207 | " if file_name.endswith('.jsonl'):\n", 208 | " source_file = source_directory / file_name\n", 209 | " \n", 210 | " with open(source_file, 'r') as json_file:\n", 211 | " for line in json_file:\n", 212 | " try:\n", 213 | " doc = json.loads(line)\n", 214 | " action = {\n", 215 | " \"_index\": index_name,\n", 216 | " \"_source\": doc\n", 217 | " }\n", 218 | " 
actions.append(action)\n", 219 | "\n", 220 | " if len(actions) == 600: # Bulk 600 docs\n", 221 | " helpers.bulk(es, actions)\n", 222 | " actions = [] \n", 223 | " except json.JSONDecodeError as e:\n", 224 | " print(f\"Error decoding JSON: {e}\")\n", 225 | " except Exception as e:\n", 226 | " print(f\"An error occurred: {e}\")\n", 227 | "\n", 228 | " if actions:\n", 229 | " helpers.bulk(es, actions)\n", 230 | "\n", 231 | " print('Indexing complete')\n", 232 | "\n", 233 | "bulk_index_documents(source_directory, index_name)" 234 | ], 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "id": "7a60983235cd1e2b", 239 | "execution_count": 15 240 | }, 241 | { 242 | "cell_type": "code", 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Index contains 100 JSONL Chunks with 1795307 documents.\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "count_result = es.count(index='pubmed_emb_index')\n", 254 | "\n", 255 | "# Print the count\n", 256 | "print(f\"Index contains 100 JSONL Chunks with {count_result['count']} documents.\")" 257 | ], 258 | "metadata": { 259 | "collapsed": false, 260 | "ExecuteTime": { 261 | "end_time": "2024-04-06T21:02:07.817164Z", 262 | "start_time": "2024-04-06T21:02:07.768706Z" 263 | } 264 | }, 265 | "id": "30f73281bec8ed96", 266 | "execution_count": 9 267 | }, 268 | { 269 | "cell_type": "code", 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "Die Grösse des Indexes ist 30.2 GB.\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "response = es.indices.stats(index='pubmed_emb_index')\n", 281 | "index_size = response['_all']['total']['store']['size_in_bytes']\n", 282 | "\n", 283 | "print(f\"Die Grösse des Indexes ist {round(index_size/1000000000, 2)} GB.\")" 284 | ], 285 | "metadata": { 286 | "collapsed": false, 287 | "ExecuteTime": { 288 | "end_time": "2024-04-06T21:02:12.749395Z", 289 | "start_time": "2024-04-06T21:02:12.668611Z" 290 | } 291 | }, 292 | "id": "6980954f4300fb66", 293 | "execution_count": 10 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "source": [], 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "id": "2b4b89dea2ad760a" 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "source": [], 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "id": "74ff19354036d83e" 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "source": [], 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "id": "cdbfcb84384f18e3" 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "source": [], 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "id": "ee9caf6f0f31f667" 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "source": [], 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "id": "77a8d3794968c862" 334 | } 335 | ], 336 | "metadata": { 337 | "kernelspec": { 338 | "display_name": "Python 3", 339 | "language": "python", 340 | "name": "python3" 341 | }, 342 | "language_info": { 343 | "codemirror_mode": { 344 | "name": "ipython", 345 | "version": 2 346 | }, 347 | "file_extension": ".py", 348 | "mimetype": "text/x-python", 349 | "name": "python", 350 | "nbconvert_exporter": "python", 351 | "pygments_lexer": "ipython2", 352 | "version": "2.7.6" 353 | } 354 | }, 355 | "nbformat": 4, 356 | "nbformat_minor": 5 357 | } 358 | -------------------------------------------------------------------------------- /evaluation/evaluation_data_storages/elasticsearch/eval_elastic.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "initial_id", 7 | "metadata": { 8 | "ExecuteTime": { 9 | "end_time": "2024-04-11T15:24:33.998287Z", 10 | "start_time": "2024-04-11T15:24:33.902535Z" 11 | }, 12 | "collapsed": true 13 | }, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "{'name': '6c4d8312349c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'JIOcJVSbToiJAWP2y6W5jQ', 'version': {'number': '8.13.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '16cc90cd2d08a3147ce02b07e50894bc060a4cbf', 'build_date': '2024-04-05T14:45:26.420424304Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "from elasticsearch import Elasticsearch\n", 25 | "import urllib3\n", 26 | "import os\n", 27 | "\n", 28 | "\n", 29 | "elastic_password = os.getenv('ELASTIC_PASSWORD')\n", 30 | "\n", 31 | "es = Elasticsearch(\n", 32 | " ['https://localhost:9200'],\n", 33 | " basic_auth=('elastic', elastic_password),\n", 34 | " verify_certs=True,\n", 35 | " ca_certs=\"/home/rag/.crt/http_ca.crt\",\n", 36 | " request_timeout=60\n", 37 | " )\n", 38 | "\n", 39 | "index_name = \"pubmed_index\"\n", 40 | "\n", 41 | "urllib3.disable_warnings()\n", 42 | "\n", 43 | "print(es.info())" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "68205f492c85d24b", 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2024-04-11T15:27:03.643225Z", 53 | "start_time": "2024-04-11T15:27:03.607945Z" 54 | }, 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "ObjectApiResponse({'count': 10269126, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})" 62 | ] 63 | }, 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "# checking number of documents in the index\n", 71 | "es.count(index=index_name)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "cb477e6f0b7178d4", 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "source": [ 81 | "### Define query functions for BM25 and Vector Similarity Search\n", 82 | "\n", 83 | "Define a function to perform a BM25 search using the match query." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 10, 89 | "id": "f78167d428f06529", 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2024-04-11T15:27:07.474197Z", 93 | "start_time": "2024-04-11T15:27:07.468939Z" 94 | }, 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "# Define a search query\n", 100 | "def bm25_search(query: str, k: int = 5):\n", 101 | " query = {\n", 102 | " \"size\": k,\n", 103 | " \"query\": {\n", 104 | " \"match\": {\n", 105 | " \"content\": f\"{query}\"\n", 106 | " }\n", 107 | " },\n", 108 | " \"_source\": [\"PMID\", \"title\"]\n", 109 | " }\n", 110 | " # Elasticsearch uses the BM25 model by default to score document relevance\n", 111 | " return es.search(index='pubmed_index', body=query)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "d079c4d6618d3b38", 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "source": [ 121 | "Initialize the text embedder and define a function to convert a query to a vector using the bioBERT embeddings." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 14, 127 | "id": "ed4ac4f757e7680d", 128 | "metadata": { 129 | "ExecuteTime": { 130 | "end_time": "2024-04-11T15:27:23.549841Z", 131 | "start_time": "2024-04-11T15:27:14.558573Z" 132 | }, 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "from Embedding import TextEmbedder\n", 138 | "embedder = TextEmbedder()\n", 139 | "\n", 140 | "def query_to_vector(text, embedder):\n", 141 | " embedding = embedder.embed(text)\n", 142 | " return embedding" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "e6015236b12040cb", 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "source": [ 152 | "Define a function to perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents." 
153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 4, 158 | "id": "caccaaa64018caea", 159 | "metadata": { 160 | "ExecuteTime": { 161 | "end_time": "2024-04-10T11:14:40.480125Z", 162 | "start_time": "2024-04-10T11:14:40.474963Z" 163 | }, 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "def cosine_similarity(index, query: str, k: int = 5):\n", 169 | " query_vector = query_to_vector(query, embedder)\n", 170 | " \n", 171 | " query = {\n", 172 | " \"size\": k, # number of results to return\n", 173 | " \"query\": {\n", 174 | " \"script_score\": {\n", 175 | " \"query\": {\"match_all\": {}},\n", 176 | " \"script\": {\n", 177 | " \"source\": \"cosineSimilarity(params.query_vector, 'embeddings') + 1.0\",\n", 178 | " # +1.0 to ensure that all values are positive\n", 179 | " \"params\": {\"query_vector\": query_vector}\n", 180 | " }\n", 181 | " }\n", 182 | " }\n", 183 | " }\n", 184 | " return es.search(index=index, body=query)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "raw", 189 | "id": "6101062777191904", 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "source": [ 194 | "Trying to use the kNN search instead of the cosine similarity search." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 5, 200 | "id": "89ecbbd9451faf5f", 201 | "metadata": { 202 | "ExecuteTime": { 203 | "end_time": "2024-04-10T11:14:41.556397Z", 204 | "start_time": "2024-04-10T11:14:41.551909Z" 205 | }, 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "def knn_search(index, query: str, k: int = 10):\n", 211 | " # Convert the query into a vector\n", 212 | " query_vector = query_to_vector(query, embedder)\n", 213 | " \n", 214 | " # Build the k-NN search request\n", 215 | " knn_query = { \n", 216 | " \"knn\": {\n", 217 | " \"field\": \"embeddings\", # the field that contains the vectors\n", 218 | " \"query_vector\": query_vector,\n", 219 | " \"k\": k,\n", 220 | " \"num_candidates\": 100 \n", 221 | " }\n", 222 | " }\n", 223 | " \n", 224 | " # Run the k-NN search\n", 225 | " return es.search(index=index, body=knn_query)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "8548993de949a242", 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "source": [ 235 | "#### Perform BM25 and Vector Similarity Searches\n", 236 | "\n", 237 | "First, perform a BM25 search using the match query." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 14, 243 | "id": "4cc8b147ef147f06", 244 | "metadata": { 245 | "ExecuteTime": { 246 | "end_time": "2024-04-11T15:29:51.585418Z", 247 | "start_time": "2024-04-11T15:29:48.190681Z" 248 | }, 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "results = bm25_search(\"What is the mortality rate of COVID-19?\", k=100)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 15, 259 | "id": "c2c54153b7684d93", 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2024-04-11T15:29:54.532943Z", 263 | "start_time": "2024-04-11T15:29:54.527831Z" 264 | }, 265 | "collapsed": false 266 | }, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "Score: 26.324913, PMID: 32621066, Title: Analysis of Austrian COVID-19 deaths by age and sex.\n", 273 | "Score: 26.013273, PMID: 34783897, Title: Preterm birth, stillbirth and early neonatal mortality during the Danish COVID-19 lockdown.\n", 274 | "Score: 25.476734, PMID: 33776407, Title: Multiple sclerosis patients and COVID-19.\n", 275 | "Score: 25.37893, PMID: 35813262, Title: Measuring the Effect of COVID-19 Pandemic on Mortality: Review and Prospect - China, 2021.\n", 276 | "Score: 25.233519, PMID: 32865940, Title: COVID-19: Why Has the Mortality Rate Declined?\n", 277 | "Score: 24.946014, PMID: 32365212, Title: [Covid-19 - deaths and analysis].\n", 278 | "Score: 24.925226, PMID: 33218796, Title: Spatial inequalities of COVID-19 mortality rate in relation to socioeconomic and environmental factors across England.\n", 279 | "Score: 24.884295, PMID: 35482643, Title: COVID-19 mortality in the United States: It's been two Americas from the start.\n", 280 | "Score: 24.837067, PMID: 32678061, Title: Men and COVID-19: A Biopsychosocial Approach to Understanding Sex Differences in Mortality and Recommendations for Practice and Policy Interventions.\n", 281 | "Score: 24.779966, PMID: 36343310, Title: Racial And Ethnic Inequalities In COVID-19 Mortality Within Carceral Settings: An Analysis Of Texas Prisons.\n", 282 | "Score: 24.666018, PMID: 32504106, Title: [Increased risk of deep vein thrombosis in intensive care unit patients with CoViD-19 infections?-Preliminary data].\n", 283 | "Score: 24.40749, PMID: 36199983, Title: Does 'Data fudging' explain the autocratic advantage? 
Evidence from the gap between Official Covid-19 mortality and excess mortality.\n", 284 | "Score: 24.375858, PMID: 35077101, Title: [Association between Covid-19 mortality and atmospheric pollution in Mexican cities].\n", 285 | "Score: 24.349539, PMID: 35058532, Title: State social distancing restrictions and nursing home outcomes.\n", 286 | "Score: 24.306416, PMID: 32501382, Title: What orthopedic surgeons need to know about Covid-19 pandemic.\n", 287 | "Score: 24.187761, PMID: 33075535, Title: Africa's low COVID-19 mortality rate: A paradox?\n", 288 | "Score: 24.10879, PMID: 36060856, Title: Excess deaths during the COVID-19 pandemic in Alberta, Canada.\n", 289 | "Score: 24.030546, PMID: 34581999, Title: Changes in COVID-19-Associated Deaths During a Year Among Blacks and Hispanics Compared to Whites in the State of Connecticut.\n", 290 | "Score: 24.01647, PMID: 36142027, Title: COVID-19, Non-Communicable Diseases, and Behavioral Factors in the Peruvian Population ≥ 15 Years: An Ecological Study during the First and Second Year of the COVID-19 Pandemic.\n", 291 | "Score: 23.977262, PMID: 33728066, Title: Effect of COVID-19 on Mortality of Pregnant and Postpartum Women: A Systematic Review and Meta-Analysis.\n", 292 | "Score: 23.973434, PMID: 32709854, Title: Covid-19 mortality is negatively associated with test number and government effectiveness.\n", 293 | "Score: 23.84487, PMID: 32661937, Title: Pediatrician, watch out for corona-phobia.\n", 294 | "Score: 23.754244, PMID: 34218318, Title: COVID-19 in pediatric patients undergoing chronic dialysis and kidney transplantation.\n", 295 | "Score: 23.751411, PMID: 32756513, Title: COVID-19 Global Risk: Expectation vs. Reality.\n", 296 | "Score: 23.73906, PMID: 32992105, Title: Preparing for and responding to Covid-19's 'second hit'.\n", 297 | "Score: 23.738796, PMID: 34048201, Title: Impact of COVID-19 Pandemic in a Brazilian High-Volume Aortic Center.\n", 298 | "Score: 23.663988, PMID: 34400452, Title: Temporal trends of COVID-19 mortality and hospitalisation rates: an observational cohort study from the US Department of Veterans Affairs.\n", 299 | "Score: 23.659967, PMID: 35217534, Title: Contribution of the elevated thrombosis risk of males to the excess male mortality observed in COVID-19: an observational study.\n", 300 | "Score: 23.652725, PMID: 35930243, Title: The role of kidney injury biomarkers in COVID-19.\n", 301 | "Score: 23.644642, PMID: 35330387, Title: Characteristics of the Third COVID-19 Pandemic Wave with Special Focus on Socioeconomic Inequalities in Morbidity, Mortality and the Uptake of COVID-19 Vaccination in Hungary.\n", 302 | "Score: 23.502783, PMID: 34630651, Title: Extracorporeal membrane oxygenation in the management of critically ill patients with coronavirus disease 2019: A narrative review.\n", 303 | "Score: 23.470608, PMID: 32343650, Title: Excess Mortality Estimation During the COVID-19 Pandemic: Preliminary Data from Portugal.\n", 304 | "Score: 23.391472, PMID: 33292536, Title: All-cause mortality supports the COVID-19 mortality in Belgium and comparison with major fatal events of the last century.\n", 305 | "Score: 23.388765, PMID: 34308302, Title: Estimating the early impact of vaccination against COVID-19 on deaths among elderly people in Brazil: Analyses of routinely-collected data on vaccine coverage and mortality.\n", 306 | "Score: 23.369747, PMID: 34354682, Title: Causes and Consequences of COVID-19-Associated Bacterial Infections.\n", 307 | "Score: 23.368668, PMID: 36093278, Title: The sources of the 
Kuznets relationship between the COVID-19 mortality rate and economic performance.\n", 308 | "Score: 23.34332, PMID: 34534226, Title: COVID-19 mortality with regard to healthcare services availability, health risks, and socio-spatial factors at department level in France: A spatial cross-sectional analysis.\n", 309 | "Score: 23.311697, PMID: 35898347, Title: Geographic and Temporal Patterns in Covid-19 Mortality by Race and Ethnicity in the United States from March 2020 to February 2022.\n", 310 | "Score: 23.279709, PMID: 32831521, Title: Putative Natural History of CoViD-19.\n", 311 | "Score: 23.26163, PMID: 34240648, Title: The one-sided explanations of a multifactorial coronavirus disease.\n", 312 | "Score: 23.255978, PMID: 33860908, Title: How often and to what extent do admitted COVID-19 patients have signs of cardiac injury?\n", 313 | "Score: 23.243877, PMID: 32776139, Title: Osteopathic Considerations for the Pregnant Patient with COVID-19.\n", 314 | "Score: 23.24017, PMID: 32945643, Title: Sex hormones and COVID-19: tussle between the two.\n", 315 | "Score: 23.193726, PMID: 32982349, Title: COVID's Razor: RAS Imbalance, the Common Denominator Across Disparate, Unexpected Aspects of COVID-19.\n", 316 | "Score: 23.180195, PMID: 35737205, Title: Sex differences in the mortality rate for coronavirus disease 2019 compared to other causes of death: an analysis of population-wide data from 63 countries.\n", 317 | "Score: 23.161182, PMID: 32953124, Title: Socio-economic inequality in global incidence and mortality rates from coronavirus disease 2019: an ecological study.\n", 318 | "Score: 23.136011, PMID: 34318585, Title: One-year mortality and consequences of COVID-19 in cancer patients: A cohort study.\n", 319 | "Score: 23.110867, PMID: 34960692, Title: The Impact of COVID-19 on Mortality in Spain: Monitoring Excess Mortality (MoMo) and the Surveillance of Confirmed COVID-19 Deaths.\n", 320 | "Score: 23.061127, PMID: 34402152, Title: A global country-level analysis of the relationship between obesity and COVID-19 cases and mortality.\n", 321 | "Score: 23.05785, PMID: 35951587, Title: Association between median household income, state Medicaid expansion status, and COVID-19 outcomes across US counties.\n", 322 | "Score: 23.050909, PMID: 34154865, Title: Perioperative mortality and morbidity of hip fractures among COVID-19 infected and non-infected patients: A systematic review and meta-analysis.\n", 323 | "Score: 23.036907, PMID: 34772396, Title: Effect of altitude on COVID-19 mortality in Ecuador: an ecological study.\n", 324 | "Score: 22.996613, PMID: 35946619, Title: Social and territorial inequalities in the mortality of children and adolescents due to COVID-19 in Brazil.\n", 325 | "Score: 22.993288, PMID: 35908851, Title: Marburg virus disease: A deadly rare virus is coming.\n", 326 | "Score: 22.960596, PMID: 33350316, Title: The prevalence, mortality, and associated risk factors for developing COVID-19 in hip fracture patients: a systematic review and meta-analysis.\n", 327 | "Score: 22.942175, PMID: 34612774, Title: Mortality in hospitalized COVID-19 patients was associated with the COVID-19 admission rate during the first year of the pandemic in Sweden.\n", 328 | "Score: 22.86676, PMID: 36319938, Title: Centenarians born before 1919 are resistant to COVID-19.\n", 329 | "Score: 22.863209, PMID: 33163879, Title: A comparative analysis of the COVID-19 pandemic response: The case of Turkey.\n", 330 | "Score: 22.852364, PMID: 32941862, Title: Critically Ill Adults With Coronavirus 
Disease 2019 in New Orleans and Care With an Evidence-Based Protocol.\n", 331 | "Score: 22.83515, PMID: 32947506, Title: Systematic review and meta-analysis of the effectiveness and safety of hydroxychloroquine in treating COVID-19 patients.\n", 332 | "Score: 22.809055, PMID: 33690595, Title: International heterogeneity in coronavirus disease 2019 pediatric mortality rates.\n", 333 | "Score: 22.803581, PMID: 33412821, Title: Socioeconomic inequalities in overall and COVID-19 mortality during the first outbreak peak in Emilia-Romagna Region (Northern Italy).\n", 334 | "Score: 22.79761, PMID: 33591211, Title: The rate of COVID-19 and associated mortality after elective hip and knee arthroplasty prior to cessation of elective services in UK.\n", 335 | "Score: 22.794813, PMID: 33655277, Title: Sex differences in the mortality rate for coronavirus disease 2019 compared to other causes of death.\n", 336 | "Score: 22.784885, PMID: 34143810, Title: Association of the past epidemic of Mycobacterium tuberculosis with mortality and incidence of COVID-19.\n", 337 | "Score: 22.782915, PMID: 35811045, Title: Trends in Etiology-based Mortality From Chronic Liver Disease Before and During COVID-19 Pandemic in the United States.\n", 338 | "Score: 22.77812, PMID: 34236244, Title: Clinical characteristics, risk factors, and cardiac manifestations of cancer patients with COVID-19.\n", 339 | "Score: 22.765633, PMID: 33495884, Title: Worldwide inverse correlation between Bacille Calmette-Guérin (BCG) immunization and COVID-19 mortality.\n", 340 | "Score: 22.764214, PMID: 34857490, Title: Impact of diabetes mellitus on COVID-19 clinical symptoms and mortality: Jakarta's COVID-19 epidemiological registry.\n", 341 | "Score: 22.725569, PMID: 35757461, Title: Associations between nighttime light and COVID-19 incidence and mortality in the United States.\n", 342 | "Score: 22.719309, PMID: 33818679, Title: Sex Disparities in COVID-19 Mortality Vary Across US Racial Groups.\n", 343 | "Score: 22.710667, PMID: 33830986, Title: COVID-19 Incidence and Mortality Among American Indian/Alaska Native and White Persons - Montana, March 13-November 30, 2020.\n", 344 | "Score: 22.710562, PMID: 34698315, Title: Association between Obesity and COVID-19 Mortality in Peru: An Ecological Study.\n", 345 | "Score: 22.701323, PMID: 35360752, Title: HELLP Syndrome and COVID-19; association or accident: A case series.\n", 346 | "Score: 22.699673, PMID: 34164954, Title: Incidence and Mortality Associated with Cardiovascular Medication among Hypertensive COVID-19 Patients in South Korea.\n", 347 | "Score: 22.697275, PMID: 36189099, Title: Low Mortality of Orthopedic Trauma Patients With Asymptomatic COVID-19: A Level I Trauma Center Pandemic Experience.\n", 348 | "Score: 22.682953, PMID: 32980614, Title: A systematic review of COVID-19 and obstructive sleep apnoea.\n", 349 | "Score: 22.655434, PMID: 36125437, Title: Comparison of all renal replacement therapy modalities in terms of COVID-19 infection rate & mortality in the COVID-19 pandemic and importance of home therapies.\n", 350 | "Score: 22.59405, PMID: 32865700, Title: Respiratory characteristics and related intraoperative ventilatory management for patients with COVID-19 pneumonia.\n", 351 | "Score: 22.58143, PMID: 33519136, Title: Hospital transmission rates of the SARS-CoV 2 disease amongst orthopaedic in-patients in a secondary care centre: A quantitative review.\n", 352 | "Score: 22.571785, PMID: 35411615, Title: Outcomes of Minority COVID-19 patients managed with ECMO: A 
single-center experience.\n", 353 | "Score: 22.562658, PMID: 33024235, Title: The age distribution of mortality from novel coronavirus disease (COVID-19) suggests no large difference of susceptibility by age.\n", 354 | "Score: 22.5612, PMID: 34884277, Title: The Impact of COVID-19 Pandemic on Management and Outcome in Patients with Heart Failure.\n", 355 | "Score: 22.557558, PMID: 33657587, Title: Lessons from COVID-19 mortality data across countries.\n", 356 | "Score: 22.556175, PMID: 32292261, Title: Biological and epidemiological trends in the prevalence and mortality due to outbreaks of novel coronavirus COVID-19.\n", 357 | "Score: 22.549688, PMID: 34780361, Title: Association between obesity and diabetes prevalence and COVID-19 mortality in Mexico: an ecological study.\n", 358 | "Score: 22.54769, PMID: 35422037, Title: Rapidly improving acute respiratory distress syndrome in COVID-19: a multi-centre observational study.\n", 359 | "Score: 22.540619, PMID: 33831280, Title: Distribution of COVID-19 cases and deaths in Europe during the first 12 peak weeks of outbreak.\n", 360 | "Score: 22.53608, PMID: 34816925, Title: The prognostic significance of erythrocyte sedimentation rate in COVID-19.\n", 361 | "Score: 22.532238, PMID: 35530744, Title: Incidence and predictors of mortality among COVID-19 patients admitted to treatment centers in North West Ethiopia; A retrospective cohort study, 2021.\n", 362 | "Score: 22.523144, PMID: 35014703, Title: COVID-19 infection and its consequences among surgical oncology patients: A systematic analysis, meta-analysis and meta-regression.\n", 363 | "Score: 22.519073, PMID: 33288965, Title: [Public healthcare expenditure and COVID-19 mortality in Spain and in Europe].\n", 364 | "Score: 22.519073, PMID: 34049840, Title: Public healthcare expenditure and COVID-19 mortality in Spain and in Europe.\n", 365 | "Score: 22.511044, PMID: 33556327, Title: Factors associated with the spatial heterogeneity of the first wave of COVID-19 in France: a nationwide geo-epidemiological study.\n", 366 | "Score: 22.510841, PMID: 34027674, Title: Mortality risk of surgically managing orthopaedic trauma during the COVID-19 pandemic.\n", 367 | "Score: 22.508713, PMID: 35340979, Title: The world trade network: country centrality and the COVID-19 pandemic.\n", 368 | "Score: 22.507309, PMID: 34488764, Title: Impact of long-term exposure to PM2.5 and temperature on coronavirus disease mortality: observed trends in France.\n", 369 | "Score: 22.496813, PMID: 35260481, Title: Risk and protective factors for severe COVID-19 infection in a cohort of patients with sickle cell disease.\n", 370 | "Score: 22.496813, PMID: 35700866, Title: Geographical distribution of cystic fibrosis carriers as population genetic determinant of COVID-19 spread and fatality in 37 countries.\n", 371 | "Score: 22.489035, PMID: 33879694, Title: The therapeutic effect and safety of the drugs for COVID-19: A systematic review and meta-analysis.\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "# Print the results\n", 377 | "for hit in results['hits']['hits']:\n", 378 | " print(f\"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}\")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "a381d7040c0a1d1c", 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "source": [ 388 | "#### Vector Similarity Search\n", 389 | "now, perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 30, 395 | "id": "c9b5f988888de099", 396 | "metadata": { 397 | "ExecuteTime": { 398 | "end_time": "2024-04-06T21:20:25.617032Z", 399 | "start_time": "2024-04-06T21:19:31.899009Z" 400 | }, 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Score: 1.9210962, PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.\n", 409 | "Score: 1.9196633, PMID: 2042633, Title: The Egr family of nuclear signal transducers.\n", 410 | "Score: 1.9190896, PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.\n", 411 | "Score: 1.9177192, PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.\n", 412 | "Score: 1.9172626, PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.\n", 413 | "Score: 1.9162145, PMID: 1622545, Title: The regulation and function of p21ras in T cells.\n", 414 | "Score: 1.9161748, PMID: 1329870, Title: The junction between cytokines and cell adhesion.\n", 415 | "Score: 1.9161192, PMID: 1645965, Title: Overexpression of human TRK proto-oncogene into mouse cells using an inducible vector system.\n", 416 | "Score: 1.9159867, PMID: 1675819, Title: The expanding family of guanylyl cyclases.\n", 417 | "Score: 1.9159176, PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.\n" 418 | ] 419 | } 420 | ], 421 | "source": [ 422 | "# Run the query\n", 423 | "results = cosine_similarity(index_name, \"List signaling molecules (ligands) that interact with the receptor EGFR?\", k=10)\n", 424 | "\n", 425 | "for hit in results['hits']['hits']:\n", 426 | " print(f\"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}\")" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "4cea44d3127ce96", 432 | "metadata": { 433 | "collapsed": false 434 | }, 435 | "source": [ 436 | "#### k-NN Search\n", 437 | "Perform a k-NN search using the k-NN search API." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 45, 443 | "id": "bfcd1d07b4fee44c", 444 | "metadata": { 445 | "ExecuteTime": { 446 | "end_time": "2024-04-06T21:35:04.707056Z", 447 | "start_time": "2024-04-06T21:34:40.128519Z" 448 | }, 449 | "collapsed": false 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "Score: 0.96054816, PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.\n", 457 | "Score: 0.9598316, PMID: 2042633, Title: The Egr family of nuclear signal transducers.\n", 458 | "Score: 0.9595448, PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.\n", 459 | "Score: 0.9588597, PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.\n", 460 | "Score: 0.9586312, PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.\n", 461 | "Score: 0.95810723, PMID: 1622545, Title: The regulation and function of p21ras in T cells.\n", 462 | "Score: 0.9580873, PMID: 1329870, Title: The junction between cytokines and cell adhesion.\n", 463 | "Score: 0.9579934, PMID: 1675819, Title: The expanding family of guanylyl cyclases.\n", 464 | "Score: 0.9579588, PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.\n", 465 | "Score: 0.9578283, PMID: 2103500, Title: Cellular and viral ligands that interact with the EGF receptor.\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "results = knn_search(index_name, \"List signaling molecules (ligands) that interact with the receptor EGFR?\", k=10)\n", 471 | "\n", 472 | "for hit in results['hits']['hits']:\n", 473 | " print(f\"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}\")" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "id": "1846b6e75b5da818", 479 | "metadata": { 480 | "collapsed": false 481 | }, 482 | "source": [ 483 | "### ELSER - Elastic Learned Sparse EncodeR\n", 484 | "\n", 485 | "ELSER is a sparse vector representation for semantic retrieval developed by Elastic. Instead of dense vector representations, ELSER uses sparse vectors to represent text data. " 486 | ] 487 | } 488 | ], 489 | "metadata": { 490 | "kernelspec": { 491 | "display_name": "Python 3", 492 | "language": "python", 493 | "name": "python3" 494 | }, 495 | "language_info": { 496 | "codemirror_mode": { 497 | "name": "ipython", 498 | "version": 3 499 | }, 500 | "file_extension": ".py", 501 | "mimetype": "text/x-python", 502 | "name": "python", 503 | "nbconvert_exporter": "python", 504 | "pygments_lexer": "ipython3", 505 | "version": "3.10.12" 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 5 510 | } 511 | --------------------------------------------------------------------------------