├── src ├── __init__.py ├── computer_vision │ ├── __init__.py │ ├── object_detection │ │ ├── __init__.py │ │ ├── multi_objects.py │ │ └── face_detection.py │ └── landmarks │ │ ├── __init__.py │ │ ├── pose_landmarks.py │ │ ├── face_landmarks.py │ │ └── base.py ├── machine_learning │ ├── __init__.py │ ├── clustering │ │ ├── __init__.py │ │ ├── dbscan_manager.py │ │ └── kmeans_manager.py │ ├── datasets.py │ └── xgboost_manager.py ├── statistics │ ├── statistical_tests │ │ ├── __init__.py │ │ ├── chi_squared.py │ │ └── ab_test.py │ └── dimensionality_reduction │ │ ├── __init__.py │ │ ├── umap_manager.py │ │ ├── tsne_manager.py │ │ └── pca_manager.py └── generative_ai │ ├── image_generation │ ├── __init__.py │ ├── dall_e.py │ └── stable_diffusion.py │ └── large_language_models │ ├── __init__.py │ ├── chatbots │ ├── __init__.py │ ├── chatbot_web_summary.py │ ├── chatbot_rag.py │ ├── chatbot_tools.py │ └── chatbot.py │ ├── callbacks.py │ └── ingest.py ├── data └── documents │ └── .gitkeep ├── faiss_index └── .gitkeep ├── notebooks └── draft.ipynb ├── packages.txt ├── pages ├── __init__.py ├── landmarks │ ├── __init__.py │ ├── face_landmarks.py │ └── pose_landmarks.py ├── classification │ ├── __init__.py │ └── xgboost.py ├── clustering │ ├── __init__.py │ ├── dbscan.py │ └── kmeans.py ├── regression │ ├── __init__.py │ └── xgboost.py ├── image_generation │ ├── __init__.py │ ├── dall_e.py │ └── stable_diffusion.py ├── object_detection │ ├── __init__.py │ ├── face_detection.py │ └── multi_objects.py ├── statistical_tests │ ├── __init__.py │ ├── ab_test.py │ └── chi2_test.py ├── dimensionality_reduction │ ├── __init__.py │ ├── t-sne.py │ ├── umap.py │ └── pca.py ├── large_language_models │ ├── __init__.py │ ├── chatbot_web_summary.py │ ├── chatbot.py │ ├── chatbot_tools.py │ └── chatbot_rag.py └── pages_config.yaml ├── utils ├── widgets │ ├── __init__.py │ ├── language.py │ └── lakera.py ├── callbacks.py ├── shap.py ├── secrets.py ├── turn.py ├── __init__.py ├── image_annotation.py ├── logging.py ├── pages_config.py ├── misc.py └── streamlit_display.py ├── .streamlit ├── config.toml └── secrets.toml.example ├── .gitignore ├── app.py ├── Dockerfile ├── bin └── run.sh ├── config ├── providers.yaml └── models.yaml ├── LICENSE ├── pyproject.toml └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/documents/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /faiss_index/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/draft.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | python3-opencv -------------------------------------------------------------------------------- /src/computer_vision/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pages/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 
2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/landmarks/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/classification/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/regression/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/object_detection/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/statistical_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /utils/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | from utils.widgets.lakera import LakeraWidget 2 | from utils.widgets.language import LanguageWidget 3 | 4 | __all__ = ["LakeraWidget", "LanguageWidget"] 5 | -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [browser] 2 | gatherUsageStats = true 3 | 4 | [server] 5 | address = "0.0.0.0" 6 | port = 8501 7 | 8 | [global] 9 | disableWidgetStateDuplicationWarning = true 10 | -------------------------------------------------------------------------------- /utils/callbacks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | def update_slider_callback(updated: str, to_update: str): 5 | 
setattr(st.session_state, to_update, 1 - st.session_state.get(updated)) 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_store 3 | .mypy_cache/ 4 | .ruff_cache/ 5 | .__pycache__/ 6 | .streamlit/secrets.toml 7 | *.pem 8 | *.pyc 9 | cache/ 10 | TODO.md 11 | *.pt 12 | *.faiss 13 | *.pkl -------------------------------------------------------------------------------- /src/machine_learning/__init__.py: -------------------------------------------------------------------------------- 1 | from src.machine_learning.datasets import Dataset 2 | from src.machine_learning.xgboost_manager import XGBoostManager 3 | 4 | __all__ = ["Dataset", "XGBoostManager"] 5 | -------------------------------------------------------------------------------- /src/machine_learning/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from src.machine_learning.clustering.dbscan_manager import DBScanManager 2 | from src.machine_learning.clustering.kmeans_manager import KMeansManager 3 | 4 | __all__ = ["KMeansManager", "DBScanManager"] 5 | -------------------------------------------------------------------------------- /utils/shap.py: -------------------------------------------------------------------------------- 1 | import shap 2 | import streamlit.components.v1 as components 3 | 4 | 5 | def st_shap(plot, height=None): 6 | shap_html = f"{shap.getjs()}{plot.html()}" 7 | components.html(shap_html, height=height) 8 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import streamlit_superapp as st_superapp 3 | 4 | import utils 5 | 6 | utils.load_secrets() 7 | 8 | st.set_page_config(page_title="daltunay", page_icon="🚀", layout="centered") 9 | 10 | st_superapp.run() 11 | -------------------------------------------------------------------------------- /src/computer_vision/object_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from src.computer_vision.object_detection.face_detection import \ 2 | FaceDetectionApp 3 | from src.computer_vision.object_detection.multi_objects import \ 4 | MultiObjectsDetectionApp 5 | 6 | __all__ = ["FaceDetectionApp", "MultiObjectsDetectionApp"] 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY . 
/app 6 | 7 | RUN pip install "poetry==1.7.0" \ 8 | && poetry config virtualenvs.create false \ 9 | && poetry install --no-interaction --no-dev 10 | 11 | EXPOSE 8501 12 | 13 | ENTRYPOINT ["streamlit", "run"] 14 | 15 | CMD ["app.py"] 16 | -------------------------------------------------------------------------------- /src/statistics/statistical_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from src.statistics.statistical_tests.ab_test import (ABTesting, 2 | input_group_data) 3 | from src.statistics.statistical_tests.chi_squared import Chi2Testing 4 | 5 | __all__ = ["ABTesting", "input_group_data", "Chi2Testing"] 6 | -------------------------------------------------------------------------------- /bin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DOCKER_CLI_HINTS=false 4 | 5 | if [ "$(docker ps -q --filter ancestor=daltunay)" ]; then 6 | docker stop $(docker ps -q --filter ancestor=daltunay) 7 | docker rm $(docker ps -q --filter ancestor=daltunay) 8 | fi 9 | 10 | docker build -t daltunay . && 11 | docker run -p 8501:8501 daltunay 12 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from src.computer_vision.landmarks.base import BaseLandmarkerApp 2 | from src.computer_vision.landmarks.face_landmarks import FaceLandmarkerApp 3 | from src.computer_vision.landmarks.pose_landmarks import PoseLandmarkerApp 4 | 5 | __all__ = ["BaseLandmarkerApp", "FaceLandmarkerApp", "PoseLandmarkerApp"] 6 | -------------------------------------------------------------------------------- /.streamlit/secrets.toml.example: -------------------------------------------------------------------------------- 1 | [twilio] 2 | TWILIO_ACCOUNT_SID = "<...>" 3 | TWILIO_AUTH_TOKEN = "<...>" 4 | 5 | [openai] 6 | OPENAI_API_KEY = "<...>" 7 | 8 | [together] 9 | TOGETHER_API_KEY = "<...>" 10 | 11 | [lakera_guard] 12 | LAKERA_GUARD_API_KEY = "<...>" 13 | 14 | [google] 15 | GOOGLE_API_KEY = "<...>" 16 | GOOGLE_CSE_ID = "<...>" 17 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/__init__.py: -------------------------------------------------------------------------------- 1 | from src.statistics.dimensionality_reduction.pca_manager import PCAManager 2 | from src.statistics.dimensionality_reduction.tsne_manager import TSNEManager 3 | from src.statistics.dimensionality_reduction.umap_manager import UMAPManager 4 | 5 | __all__ = ["PCAManager", "TSNEManager", "UMAPManager"] 6 | -------------------------------------------------------------------------------- /pages/large_language_models/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import yaml 4 | 5 | import utils 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | with open("config/models.yaml") as f: 11 | LLM_CONFIG: t.Dict[str, str] = yaml.safe_load(f)["generative_ai"][ 12 | "large_language_models" 13 | ] 14 | 15 | __all__ = ["LLM_CONFIG"] 16 | -------------------------------------------------------------------------------- /utils/secrets.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | 5 | import utils 6 | 7 | logger = 
utils.CustomLogger(__file__) 8 | 9 | 10 | def load_secrets(): 11 | for secrets in st.secrets.values(): 12 | for secret_name, secret in secrets.items(): 13 | masked_secret = secret[:4] + "*" * (len(secret) - 4) 14 | logger.info(f"Setting {secret_name}={masked_secret}") 15 | os.environ[secret_name] = secret 16 | -------------------------------------------------------------------------------- /config/providers.yaml: -------------------------------------------------------------------------------- 1 | openai: 2 | name: OpenAI 3 | url: https://openai.com/ 4 | api: 5 | help: https://platform.openai.com/account/api-keys 6 | endpoint: https://api.openai.com/v1/images/generations 7 | key: OPENAI_API_KEY 8 | 9 | together: 10 | name: Together AI 11 | url: https://www.together.ai/ 12 | api: 13 | help: https://api.together.xyz/settings/api-keys 14 | endpoint: https://api.together.xyz/inference 15 | key: TOGETHER_API_KEY 16 | -------------------------------------------------------------------------------- /utils/turn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | from twilio.rest import Client 5 | 6 | import utils 7 | 8 | logger = utils.CustomLogger(__file__) 9 | 10 | 11 | @st.cache_data(show_spinner=False) 12 | def get_ice_servers(): 13 | account_sid = os.getenv("TWILIO_ACCOUNT_SID") 14 | auth_token = os.getenv("TWILIO_AUTH_TOKEN") 15 | 16 | client = Client(account_sid, auth_token) 17 | token = client.tokens.create() 18 | 19 | return token.ice_servers 20 | -------------------------------------------------------------------------------- /pages/landmarks/face_landmarks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.landmarks import FaceLandmarkerApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code("src/computer_vision/landmarks/face_landmarks.py") 16 | 17 | st_ss.setdefault("face_app", FaceLandmarkerApp()).stream() 18 | -------------------------------------------------------------------------------- /pages/landmarks/pose_landmarks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.landmarks import PoseLandmarkerApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code("src/computer_vision/landmarks/pose_landmarks.py") 16 | 17 | st_ss.setdefault("pose_app", PoseLandmarkerApp()).stream() 18 | -------------------------------------------------------------------------------- /src/generative_ai/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import yaml 4 | 5 | from src.generative_ai.image_generation.dall_e import dall_e_image 6 | from src.generative_ai.image_generation.stable_diffusion import \ 7 | stable_diffusion_image 8 | 9 | with open("config/models.yaml") as f: 10 | IMAGE_GEN_CONFIG: t.Dict[str, str] = yaml.safe_load(f)["generative_ai"][ 11 | "image_creation" 12 | ] 13 | 14 | __all__ = ["IMAGE_GEN_CONFIG", "dall_e_image", "stable_diffusion_image"] 15 | 
-------------------------------------------------------------------------------- /pages/object_detection/face_detection.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.object_detection import FaceDetectionApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code( 16 | path="src/computer_vision/object_detection/face_detection.py" 17 | ) 18 | 19 | st_ss.setdefault("face_detection_app", FaceDetectionApp()).stream() 20 | -------------------------------------------------------------------------------- /pages/object_detection/multi_objects.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.object_detection import MultiObjectsDetectionApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code(path="src/computer_vision/object_detection/multi_objects.py") 16 | 17 | st_ss.setdefault("multi_objects_detection_app", MultiObjectsDetectionApp()).stream() 18 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/__init__.py: -------------------------------------------------------------------------------- 1 | from src.generative_ai.large_language_models.callbacks import \ 2 | StreamingChatCallbackHandler 3 | from src.generative_ai.large_language_models.chatbots import ( 4 | Chatbot, ChatbotRAG, ChatbotTools, ChatbotWebSummary) 5 | from src.generative_ai.large_language_models.ingest import get_vector_store 6 | 7 | __all__ = [ 8 | "Chatbot", 9 | "ChatbotRAG", 10 | "ChatbotTools", 11 | "ChatbotWebSummary", 12 | "StreamingChatCallbackHandler", 13 | "get_vector_store", 14 | ] 15 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/__init__.py: -------------------------------------------------------------------------------- 1 | from src.generative_ai.large_language_models.chatbots.chatbot import ( 2 | Chatbot, ModelArgs) 3 | from src.generative_ai.large_language_models.chatbots.chatbot_rag import \ 4 | ChatbotRAG 5 | from src.generative_ai.large_language_models.chatbots.chatbot_tools import \ 6 | ChatbotTools 7 | from src.generative_ai.large_language_models.chatbots.chatbot_web_summary import \ 8 | ChatbotWebSummary 9 | 10 | __all__ = ["Chatbot", "ModelArgs", "ChatbotRAG", "ChatbotTools", "ChatbotWebSummary"] 11 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/callbacks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from langchain.callbacks.base import BaseCallbackHandler 3 | 4 | 5 | class StreamingChatCallbackHandler(BaseCallbackHandler): 6 | def __init__(self): 7 | pass 8 | 9 | def on_llm_start(self, *args, **kwargs): 10 | self.container = st.empty() 11 | self.text = "" 12 | 13 | def on_llm_new_token(self, token: str, *args, **kwargs): 14 | self.text += token 15 | self.container.markdown( 16 | body=self.text, 17 | unsafe_allow_html=False, 18 | ) 19 | 20 | def 
on_llm_end(self, response: str, *args, **kwargs): 21 | self.container.markdown( 22 | body=response.generations[0][0].text, 23 | unsafe_allow_html=False, 24 | ) 25 | -------------------------------------------------------------------------------- /src/generative_ai/image_generation/dall_e.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from openai import OpenAI 3 | from PIL import Image 4 | 5 | import utils 6 | from utils.misc import base64_to_img 7 | 8 | logger = utils.CustomLogger(__file__) 9 | 10 | 11 | @st.cache_data(show_spinner="Generating picture...") 12 | def dall_e_image( 13 | prompt: str, 14 | width: int = 1024, 15 | height: int = 1024, 16 | ) -> Image.Image: 17 | from src.generative_ai.image_generation import IMAGE_GEN_CONFIG 18 | 19 | model_config = IMAGE_GEN_CONFIG["DALL-E 2"] 20 | 21 | client = OpenAI() 22 | response = client.images.generate( 23 | model=model_config["string"], 24 | prompt=prompt, 25 | size=f"{width}x{height}", 26 | n=1, 27 | response_format="b64_json", 28 | ) 29 | base64 = response.data[0].b64_json 30 | return base64_to_img(base64) 31 | -------------------------------------------------------------------------------- /pages/image_generation/dall_e.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.generative_ai.image_generation import dall_e_image 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | 12 | def main(): 13 | utils.show_source_code("src/generative_ai/image_generation/dall_e.py") 14 | 15 | submitted = False 16 | with st.form(key="dall_e_form"): 17 | prompt = st.text_input(label="Input prompt: ") 18 | centered = st.columns(3)[1] 19 | with centered: 20 | submitted = st.form_submit_button( 21 | label="Generate with DALL·E", use_container_width=True 22 | ) 23 | st.subheader(body="Output", anchor=False) 24 | if submitted: 25 | image = dall_e_image(prompt=prompt) 26 | st.image(image=image, caption=f"{prompt} - Generated by DALL·E") 27 | -------------------------------------------------------------------------------- /src/generative_ai/image_generation/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | import together 5 | from PIL import Image 6 | 7 | import utils 8 | from utils.misc import base64_to_img 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | 13 | @st.cache_data(show_spinner="Generating picture...") 14 | def stable_diffusion_image( 15 | prompt: str, 16 | width: int = 1024, 17 | height: int = 1024, 18 | ) -> Image.Image: 19 | from src.generative_ai.image_generation import IMAGE_GEN_CONFIG 20 | 21 | model_config = IMAGE_GEN_CONFIG["Stable Diffusion 2.1"] 22 | together.api_key = os.getenv("TOGETHER_API_KEY") 23 | 24 | response = together.Image.create( 25 | model=f"{model_config['owner']}/{model_config['string']}", 26 | prompt=prompt, 27 | width=width, 28 | height=height, 29 | ) 30 | 31 | base64 = response["output"]["choices"][0]["image_base64"] 32 | return base64_to_img(base64) 33 | -------------------------------------------------------------------------------- /pages/image_generation/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.generative_ai.image_generation import 
stable_diffusion_image 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | 12 | def main(): 13 | utils.show_source_code( 14 | path="src/generative_ai/image_generation/stable_diffusion.py" 15 | ) 16 | 17 | submitted = False 18 | with st.form(key="stable_diffusion_form"): 19 | prompt = st.text_input(label="Input prompt: ") 20 | centered = st.columns(3)[1] 21 | with centered: 22 | submitted = st.form_submit_button( 23 | label="Generate with Stable Diffusion", use_container_width=True 24 | ) 25 | st.subheader(body="Output", anchor=False) 26 | if submitted: 27 | image = stable_diffusion_image(prompt=prompt) 28 | st.image(image=image, caption=f"{prompt} - Generated by Stable Diffusion") 29 | -------------------------------------------------------------------------------- /config/models.yaml: -------------------------------------------------------------------------------- 1 | generative_ai: 2 | large_language_models: 3 | GPT-3.5 Turbo: 4 | provider: openai 5 | organization: OpenAI 6 | owner: null 7 | string: gpt-3.5-turbo 8 | experimental_flag: true 9 | 10 | LLaMA-2 Chat (7B): 11 | provider: together 12 | organization: Meta 13 | owner: togethercomputer 14 | string: llama-2-7b-chat 15 | experimental_flag: true 16 | 17 | Mistral (7B) Instruct: 18 | provider: together 19 | organization: mistralai 20 | owner: mistralai 21 | string: Mistral-7B-Instruct-v0.1 22 | experimental_flag: true 23 | 24 | image_creation: 25 | DALL-E 2: 26 | provider: openai 27 | organization: OpenAI 28 | owner: null 29 | string: dall-e-2 30 | experimental_flag: true 31 | 32 | Stable Diffusion 2.1: 33 | provider: together 34 | organization: Stability AI 35 | owner: stabilityai 36 | string: stable-diffusion-2-1 37 | experimental_flag: true 38 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from utils.callbacks import update_slider_callback 2 | from utils.image_annotation import annotate_time 3 | from utils.logging import CustomLogger 4 | from utils.misc import (base64_to_img, generate_logo_link, 5 | reset_session_state_key, show_logos, show_source_code) 6 | from utils.pages_config import PageConfigLoader 7 | from utils.secrets import load_secrets 8 | from utils.shap import st_shap 9 | from utils.streamlit_display import display_tab_content, tabs_config 10 | from utils.turn import get_ice_servers 11 | from utils.widgets import LakeraWidget, LanguageWidget 12 | 13 | __all__ = [ 14 | "base64_to_img", 15 | "generate_logo_link", 16 | "load_secrets", 17 | "CustomLogger", 18 | "show_logos", 19 | "show_source_code", 20 | "LakeraWidget", 21 | "LanguageWidget", 22 | "PageConfigLoader", 23 | "reset_session_state_key", 24 | "get_ice_servers", 25 | "annotate_time", 26 | "tabs_config", 27 | "display_tab_content", 28 | "update_slider_callback", 29 | "st_shap", 30 | ] 31 | -------------------------------------------------------------------------------- /utils/image_annotation.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import cv2 4 | from numpy import ndarray 5 | 6 | 7 | def annotate_time(image: ndarray) -> None: 8 | text = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 9 | text_args = { 10 | "text": text, 11 | "fontFace": cv2.FONT_HERSHEY_SIMPLEX, 12 | "fontScale": .5, 13 | "thickness": 1, 14 | } 15 | text_size = 
cv2.getTextSize(**text_args)[0] 16 | rect_width, rect_height = text_size[0] + 20, text_size[1] + 20 17 | cv2.rectangle( 18 | img=image, 19 | pt1=(0, 0), 20 | pt2=(rect_width, rect_height), 21 | color=(255, 255, 255), 22 | thickness=cv2.FILLED, 23 | ) 24 | cv2.rectangle( 25 | img=image, 26 | pt1=(0, 0), 27 | pt2=(rect_width, rect_height), 28 | color=(0, 0, 0), 29 | thickness=2, 30 | ) 31 | cv2.putText( 32 | img=image, 33 | org=(10, text_size[1] + 10), 34 | color=(0, 0, 0), 35 | lineType=cv2.LINE_AA, 36 | **text_args, 37 | ) 38 | -------------------------------------------------------------------------------- /utils/widgets/language.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import streamlit as st 4 | 5 | import utils 6 | 7 | logger = utils.CustomLogger(__file__) 8 | 9 | st_ss = st.session_state 10 | 11 | 12 | class LanguageWidget: 13 | widget_key = "language_widget" 14 | selectbox_key = f"{widget_key}.selection" 15 | 16 | def __init__( 17 | self, 18 | languages: t.List[str] | None = None, 19 | default: str | None = None, 20 | ): 21 | logger.info(f"Initializing {self.__class__.__name__}") 22 | self.languages = languages or ["English", "French"] 23 | self.default = default or "English" 24 | 25 | @property 26 | def selected_language(self): 27 | return st.selectbox( 28 | label="Language:", 29 | options=list(self.languages), 30 | index=list(self.languages).index( 31 | st_ss.get(self.selectbox_key, self.default) 32 | ), 33 | key=self.selectbox_key, 34 | help="Changes the **chat language only**, not the interface language", 35 | ) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Daniel Altunay 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/pose_landmarks.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | import mediapipe as mp 5 | 6 | from src.computer_vision.landmarks import BaseLandmarkerApp 7 | 8 | 9 | class PoseLandmarkerApp(BaseLandmarkerApp): 10 | landmarks_type = "pose_landmarks" 11 | 12 | def __init__(self): 13 | super().__init__() 14 | 15 | @cached_property 16 | def landmarker(self) -> mp.solutions.pose.Pose: 17 | return mp.solutions.pose.Pose( 18 | static_image_mode=False, 19 | model_complexity=1, 20 | smooth_landmarks=True, 21 | enable_segmentation=False, 22 | min_detection_confidence=0.5, 23 | min_tracking_confidence=0.5, 24 | ) 25 | 26 | @cached_property 27 | def connections_list(self) -> t.List[t.FrozenSet[t.Tuple[int, int]]]: 28 | return [mp.solutions.pose.POSE_CONNECTIONS] 29 | 30 | @cached_property 31 | def drawing_specs_list( 32 | self, 33 | ) -> t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]]: 34 | return [ 35 | { 36 | "landmark_drawing_spec": mp.solutions.drawing_styles.get_default_pose_landmarks_style() 37 | } 38 | ] 39 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | from functools import cached_property 4 | 5 | import streamlit as st 6 | 7 | 8 | class CustomLogger: 9 | method_names = ["debug", "info", "warning", "error", "critical"] 10 | 11 | def __init__(self, file: str, level: str = "info"): 12 | self.file = file.split("my-superapp")[1] if "my-superapp" in file else file 13 | self.level = getattr(logging, level.upper()) 14 | self.cache_methods(methods_to_cache=self.method_names) 15 | 16 | @cached_property 17 | def logger(self) -> logging.Logger: 18 | logger = logging.getLogger(self.file) 19 | logger.setLevel(self.level) 20 | stream_handler = logging.StreamHandler() 21 | formatter = logging.Formatter( 22 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 23 | ) 24 | stream_handler.setFormatter(formatter) 25 | logger.addHandler(stream_handler) 26 | 27 | return logger 28 | 29 | def cache_methods(self, methods_to_cache: t.List[str]) -> None: 30 | for method_name in methods_to_cache: 31 | method = getattr(self.logger, method_name) 32 | wrapped_method = st.cache_resource(func=method, show_spinner=False) 33 | setattr(self, method_name, wrapped_method) 34 | -------------------------------------------------------------------------------- /src/statistics/statistical_tests/chi_squared.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | from scipy.stats import chi2_contingency 6 | 7 | 8 | class Chi2Testing: 9 | def __init__( 10 | self, 11 | observed: pd.DataFrame, 12 | alpha: float, 13 | ): 14 | self.observed = observed 15 | self.alpha = alpha 16 | 17 | @staticmethod 18 | @st.cache_data(show_spinner=False) 19 | def chi2_test( 20 | observed: pd.DataFrame, 21 | ) -> t.Tuple[float, float, int, t.List[t.List[float]]]: 22 | chi2, p_value, dof, expected = chi2_contingency(observed) 23 | return chi2, p_value, dof, expected 24 | 25 | @staticmethod 26 | @st.cache_data(show_spinner=False) 27 | def is_statistically_significant(p_value: float, alpha: float) -> bool: 28 | return p_value < alpha 29 
| 30 | def perform_chi2_test(self) -> t.Dict[str, t.Any]: 31 | chi2, p_value, dof, expected = self.chi2_test(self.observed) 32 | is_significant = self.is_statistically_significant(p_value, self.alpha) 33 | 34 | return { 35 | "chi2_statistic": chi2, 36 | "p_value": p_value, 37 | "degrees_of_freedom": dof, 38 | "expected_frequencies": expected, 39 | "is_significant": is_significant, 40 | } 41 | -------------------------------------------------------------------------------- /utils/pages_config.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | import yaml 5 | 6 | import utils 7 | 8 | 9 | class PageConfigLoader: 10 | config_path = "pages/pages_config.yaml" 11 | 12 | def __init__(self, file): 13 | self.file = file 14 | self.logger = utils.CustomLogger(self.file) 15 | 16 | @cached_property 17 | def pages_config(self) -> t.Dict: 18 | with open(self.config_path, "r") as file: 19 | pages_config = yaml.safe_load(file) 20 | return pages_config 21 | 22 | @cached_property 23 | def page_config(self) -> t.Dict: 24 | path_keys = self.file.split("my-superapp/pages/")[1].split("/") 25 | section = self.pages_config 26 | 27 | for path_key in path_keys: 28 | section = section.get(path_key, {}) 29 | 30 | return self._set_recursive(section, path_keys) 31 | 32 | def _set_recursive(self, section, keys) -> t.Dict: 33 | return { 34 | key: self._set_recursive(value, keys + [key]) 35 | if isinstance(value, dict) 36 | else value 37 | for key, value in section.items() 38 | } 39 | 40 | def set_page_config(self, _globals): 41 | self.logger.info(f"Setting page config: {self.page_config}") 42 | for key, value in self.page_config.items(): 43 | _globals[key] = value 44 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "my-superapp" 3 | version = "0.0.0" 4 | description = "Daniel ALTUNAY's superapp!" 
5 | authors = ["Daniel Altunay "] 6 | readme = "README.md" 7 | homepage = "https://data-science-superapp.streamlit.app/" 8 | repository = "https://github.com/daltunay/my-superapp" 9 | packages = [{ include = "src" }, { include = "utils" }, { include = "pages" }] 10 | 11 | [tool.poetry.dependencies] 12 | python = "~3.11" 13 | # Streamlit 14 | streamlit = "^1.29.0" 15 | streamlit-superapp = "^1.3.0" 16 | streamlit-webrtc = "^0.47.1" 17 | twilio = "^8.10.3" 18 | watchdog = "^3.0.0" 19 | # Computer Vision 20 | opencv-python-headless = "^4.8.1.78" 21 | av = ">=9.0.0,<11.0.0" 22 | ultralytics = "^8.0.222" 23 | mediapipe = "^0.10.8" 24 | # LLMs 25 | langchain = "^0.0.345" 26 | openai = "^1.3.5" 27 | together = "^0.2.8" 28 | tiktoken = "^0.5.1" 29 | faiss-cpu = "^1.7.4" 30 | transformers = "^4.35.2" 31 | pypdf = "^3.17.1" 32 | unstructured = "^0.11.2" 33 | validators = "^0.22.0" 34 | ## Tools 35 | google-api-python-client = "^2.108.0" 36 | arxiv = "^2.0.0" 37 | wikipedia = "^1.4.0" 38 | stackapi = "^0.3.0" 39 | # Machine Learning 40 | scipy = "^1.11.4" 41 | scikit-learn = "^1.3.2" 42 | xgboost = "^2.0.2" 43 | shap = "^0.44.0" 44 | umap-learn = "^0.5.5" 45 | # Data Visualization 46 | plotly = "^5.18.0" 47 | 48 | [tool.poetry.group.dev.dependencies] 49 | ruff = "^0.1.3" 50 | isort = "^5.12.0" 51 | mypy = "^1.6.1" 52 | ipykernel = "^6.26.0" 53 | 54 | [tool.poetry.group.types.dependencies] 55 | types-requests = "^2.31.0.10" 56 | 57 | [build-system] 58 | requires = ["poetry-core"] 59 | build-backend = "poetry.core.masonry.api" 60 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/face_landmarks.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | import mediapipe as mp 5 | 6 | from src.computer_vision.landmarks import BaseLandmarkerApp 7 | 8 | 9 | class FaceLandmarkerApp(BaseLandmarkerApp): 10 | landmarks_type = "multi_face_landmarks" 11 | 12 | def __init__(self): 13 | super().__init__() 14 | 15 | @cached_property 16 | def landmarker(self) -> mp.solutions.face_mesh.FaceMesh: 17 | return mp.solutions.face_mesh.FaceMesh( 18 | static_image_mode=False, 19 | max_num_faces=1, 20 | refine_landmarks=True, 21 | min_detection_confidence=0.5, 22 | min_tracking_confidence=0.5, 23 | ) 24 | 25 | @cached_property 26 | def connections_list(self) -> t.List[t.FrozenSet[t.Tuple[int, int]]]: 27 | return [ 28 | mp.solutions.face_mesh.FACEMESH_TESSELATION, 29 | mp.solutions.face_mesh.FACEMESH_CONTOURS, 30 | mp.solutions.face_mesh.FACEMESH_IRISES, 31 | ] 32 | 33 | @cached_property 34 | def drawing_specs_list( 35 | self, 36 | ) -> t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]]: 37 | return [ 38 | {"connection_drawing_spec": style, "landmark_drawing_spec": None} 39 | for style in ( 40 | mp.solutions.drawing_styles.get_default_face_mesh_tesselation_style(), 41 | mp.solutions.drawing_styles.get_default_face_mesh_contours_style(), 42 | mp.solutions.drawing_styles.get_default_face_mesh_iris_connections_style(), 43 | ) 44 | ] 45 | -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | from base64 import b64decode 2 | from io import BytesIO 3 | 4 | import streamlit as st 5 | from PIL import Image 6 | 7 | st_ss = st.session_state 8 | 9 | 10 | def generate_logo_link(url: str, img_url: str) -> str: 11 | return f'' 12 | 13 | 14 | def 
show_source_code(path: str): 15 | st.markdown( 16 | "[![source code](https://img.shields.io/badge/source_code-gray?logo=github)]" 17 | f"(https://github.com/daltunay/my-superapp/tree/main/{path})" 18 | ) 19 | 20 | 21 | def show_logos(linkedin: bool = True, github: bool = True): 22 | logos = [] 23 | 24 | if linkedin: 25 | logos.append( 26 | generate_logo_link( 27 | url="https://linkedin.com/in/daltunay", 28 | img_url="https://img.icons8.com/?id=13930&format=png", 29 | ) 30 | ) 31 | 32 | if github: 33 | logos.append( 34 | generate_logo_link( 35 | url="https://github.com/daltunay", 36 | img_url="https://img.icons8.com/?id=AZOZNnY73haj&format=png", 37 | ) 38 | ) 39 | 40 | logos_html = "".join(logos) 41 | html_content = f""" 42 |
43 | Made by Daniel Altunay
44 | {logos_html} 45 |
46 | """ 47 | 48 | st.markdown(html_content, unsafe_allow_html=True) 49 | 50 | 51 | def base64_to_img(base64: str) -> Image.Image: 52 | return Image.open(BytesIO(b64decode(base64))) 53 | 54 | 55 | def reset_session_state_key(key: str): 56 | if hasattr(st_ss, key): 57 | delattr(st_ss, key) 58 | -------------------------------------------------------------------------------- /pages/clustering/dbscan.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.clustering import DBScanManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/machine_learning/clustering/dbscan_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("DBSCAN") 36 | with st.container(border=True): 37 | dbscan_manager = DBScanManager() 38 | dbscan_manager.set_model() 39 | 40 | dbscan_manager.fit(data=X) 41 | 42 | st.subheader("Scatter plot", divider="gray") 43 | col_x, col_y = st.columns(2) 44 | x_col_scatter = col_x.selectbox( 45 | label="X column", key="scatter_x", options=X.columns, index=0 46 | ) 47 | y_col_scatter = col_y.selectbox( 48 | label="Y column", key="scatter_y", options=X.columns, index=1 49 | ) 50 | st.plotly_chart( 51 | dbscan_manager.scatter_plot(x_col_scatter, y_col_scatter), 52 | use_container_width=True, 53 | ) 54 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/ingest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | from langchain.document_loaders import DirectoryLoader, PyPDFLoader 5 | from langchain.embeddings import OpenAIEmbeddings 6 | from langchain.text_splitter import RecursiveCharacterTextSplitter 7 | from langchain.vectorstores import FAISS 8 | 9 | import utils 10 | 11 | 12 | def get_loader( 13 | file: str | None = None, 14 | mode: t.Literal["local"] | t.Literal["upload"] = "local", 15 | ) -> DirectoryLoader | PyPDFLoader: 16 | if mode == "local": 17 | return DirectoryLoader( 18 | path="data/documents/", 19 | glob="./*.pdf", 20 | loader_cls=PyPDFLoader, 21 | show_progress=True, 22 | ) 23 | elif mode == "upload": 24 | return PyPDFLoader(file) 25 | 26 | 27 | def get_vector_store( 28 | file: str | None = None, 29 | mode: t.Literal["local"] | t.Literal["upload"] = "local", 30 | ) -> FAISS | None: 31 | loader = get_loader(file=file, mode=mode) 32 | documents = loader.load() 33 | splitter = RecursiveCharacterTextSplitter( 34 | chunk_size=1000, 35 | chunk_overlap=50, 36 | length_function=len, 37 | ) 38 | documents_chunked = splitter.split_documents(documents) 39 | embeddings = OpenAIEmbeddings() 40 | db = FAISS.from_documents(documents=documents_chunked, embedding=embeddings) 41 | 42 | if 
mode == "local": 43 | db.save_local( 44 | folder_path="faiss_index", 45 | index_name="index" if mode == "local" else os.path.splitext(file)[0], 46 | ) 47 | elif mode == "upload": 48 | return db 49 | 50 | 51 | def main(): 52 | get_vector_store(file=None, mode="local") 53 | 54 | 55 | if __name__ == "__main__": 56 | utils.load_secrets() 57 | main() 58 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot_web_summary.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import validators 3 | 4 | import utils 5 | from pages.large_language_models import LLM_CONFIG 6 | from src.generative_ai.large_language_models import ChatbotWebSummary 7 | 8 | loader = utils.PageConfigLoader(__file__) 9 | loader.set_page_config(globals()) 10 | 11 | logger = utils.CustomLogger(__file__) 12 | 13 | st_ss = st.session_state 14 | 15 | 16 | def main(): 17 | utils.show_source_code( 18 | path="src/generative_ai/large_language_models/chatbots/chatbot_web_summary.py" 19 | ) 20 | chosen_model = st.selectbox( 21 | label="Large Language Model:", 22 | placeholder="Choose an option", 23 | options=LLM_CONFIG.keys(), 24 | index=0, 25 | on_change=utils.reset_session_state_key, 26 | kwargs={"key": "chatbot_web_summary"}, 27 | ) 28 | 29 | chosen_chain_type = st.selectbox( 30 | label="Chain type:", 31 | options=ChatbotWebSummary.available_chain_types, 32 | index=None, 33 | on_change=utils.reset_session_state_key, 34 | kwargs={"key": "chatbot_web_summary"}, 35 | ) 36 | 37 | if chosen_model and chosen_chain_type: 38 | chatbot = st_ss.setdefault( 39 | "chatbot_web_summary", ChatbotWebSummary(**LLM_CONFIG[chosen_model]) 40 | ) 41 | else: 42 | st.info("Choose a chain type for the LLM", icon="ℹ️") 43 | 44 | if input_url := st.text_input( 45 | label="URL of the page to summarize:", 46 | disabled=not (chosen_model and chosen_chain_type), 47 | ): 48 | if validators.url(input_url): 49 | st.chat_message("human").write(input_url) 50 | with st.chat_message("ai"): 51 | chatbot.summarize(url=input_url) 52 | else: 53 | st.error("Invalid URL", icon="❌") 54 | -------------------------------------------------------------------------------- /src/computer_vision/object_detection/multi_objects.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | 3 | import streamlit_webrtc as st_webrtc 4 | from av import VideoFrame 5 | from numpy import ndarray 6 | from ultralytics import YOLO 7 | from ultralytics.engine.results import Results 8 | 9 | import utils 10 | 11 | logger = utils.CustomLogger(__file__) 12 | 13 | 14 | class MultiObjectsDetectionApp: 15 | def __init__(self): 16 | pass 17 | 18 | @cached_property 19 | def detector(self) -> YOLO: 20 | return YOLO(model="yolov8n.pt", task=None) 21 | 22 | def detect_objects(self, image: ndarray) -> Results: 23 | return self.detector.predict( 24 | source=image, 25 | stream=False, 26 | show=False, 27 | show_labels=True, 28 | show_conf=True, 29 | verbose=False, 30 | ) 31 | 32 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame: 33 | image = frame.to_ndarray(format="bgr24") 34 | 35 | detections = self.detect_objects(image) 36 | image = self.annotate_detections(detections) 37 | utils.annotate_time(image) 38 | return VideoFrame.from_ndarray(image, format="bgr24") 39 | 40 | def stream(self) -> None: 41 | st_webrtc.webrtc_streamer( 42 | video_frame_callback=self.video_frame_callback, 43 | key="multi_objects_streamer", 
44 | mode=st_webrtc.WebRtcMode.SENDRECV, 45 | rtc_configuration=st_webrtc.RTCConfiguration( 46 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"} 47 | ), 48 | media_stream_constraints={"video": True, "audio": False}, 49 | async_processing=True, 50 | desired_playing_state=None, 51 | ) 52 | 53 | @staticmethod 54 | def annotate_detections(detections: Results) -> ndarray: 55 | return detections[0].plot() 56 | -------------------------------------------------------------------------------- /utils/widgets/lakera.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | import requests 5 | import streamlit as st 6 | 7 | import utils 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | class LakeraWidget: 15 | widget_key = "lakera_widget" 16 | checkbox_key = f"{widget_key}.checkbox" 17 | 18 | def __init__( 19 | self, 20 | default: bool = False, 21 | ): 22 | logger.info(f"Initializing {self.__class__.__name__}") 23 | self.api_key = os.getenv("LAKERA_GUARD_API_KEY") 24 | self.default = default 25 | 26 | @property 27 | def lakera_activated(self): 28 | return st.checkbox( 29 | label="Prompt injection security", 30 | value=st_ss.get(self.checkbox_key, self.default), 31 | key=self.checkbox_key, 32 | help="Use Lakera Guard API to defend against LLM prompt injections", 33 | on_change=self.authentificate, 34 | ) 35 | 36 | def request_api(self, input: str) -> requests.Response: 37 | return requests.post( 38 | url="https://api.lakera.ai/v1/prompt_injection", 39 | json={"input": input}, 40 | headers={"Authorization": f"Bearer {self.api_key}"}, 41 | ) 42 | 43 | def authentificate(self): 44 | if not st_ss.get(self.checkbox_key): 45 | return 46 | try: 47 | response = self.request_api("") 48 | except requests.exceptions.SSLError: 49 | st.toast("SSL CERTIFICATE VERIFY FAILED", icon="🚫") 50 | else: 51 | success = response.ok 52 | st.toast("Lakera Guard API authentication", icon="✅" if success else "🚫") 53 | 54 | def flag_prompt(self, prompt: str) -> t.Tuple[bool, t.Dict]: 55 | response = self.request_api(prompt).json() 56 | flagged = response["results"][0]["flagged"] 57 | return flagged, response 58 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/t-sne.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.datasets import Dataset 5 | from src.statistics.dimensionality_reduction import TSNEManager 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/statistics/dimensionality_rediction/tsne_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("t-SNE") 36 | with st.container(border=True): 37 | tsne_manager = 
TSNEManager(max_n_components=3) 38 | tsne_manager.set_model() 39 | 40 | tsne_manager.fit(data=X, target_col=y) 41 | 42 | st.subheader("Scatter matrix plot", divider="gray") 43 | st.plotly_chart(tsne_manager.scatter_matrix_plot(), use_container_width=True) 44 | 45 | st.subheader("Scatter 2D plot", divider="gray") 46 | try: 47 | st.plotly_chart(tsne_manager.scatter_2d_plot(), use_container_width=True) 48 | except ValueError: 49 | st.error("Number of principal components not sufficient for the plot") 50 | 51 | st.subheader("Scatter 3D plot", divider="gray") 52 | try: 53 | st.plotly_chart(tsne_manager.scatter_3d_plot(), use_container_width=True) 54 | except ValueError: 55 | st.error("Number of principal components not sufficient for the plot") 56 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/umap.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.datasets import Dataset 5 | from src.statistics.dimensionality_reduction import UMAPManager 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/statistics/dimensionality_rediction/umap_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("UMAP") 36 | with st.container(border=True): 37 | umap_manager = UMAPManager(max_n_components=3) 38 | umap_manager.set_model() 39 | 40 | umap_manager.fit(data=X, target_col=y) 41 | 42 | st.subheader("Scatter matrix plot", divider="gray") 43 | st.plotly_chart(umap_manager.scatter_matrix_plot(), use_container_width=True) 44 | 45 | st.subheader("Scatter 2D plot", divider="gray") 46 | try: 47 | st.plotly_chart(umap_manager.scatter_2d_plot(), use_container_width=True) 48 | except ValueError: 49 | st.error("Number of principal components not sufficient for the plot") 50 | 51 | st.subheader("Scatter 3D plot", divider="gray") 52 | try: 53 | st.plotly_chart(umap_manager.scatter_3d_plot(), use_container_width=True) 54 | except ValueError: 55 | st.error("Number of principal components not sufficient for the plot") 56 | -------------------------------------------------------------------------------- /utils/streamlit_display.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | 6 | 7 | def tabs_config(): 8 | st.markdown( 9 | """ 10 | 17 | """, 18 | unsafe_allow_html=True, 19 | ) 20 | 21 | 22 | def display_tab_content( 23 | label: t.Literal["train", "test"], 24 | X_data: pd.DataFrame, 25 | y_data: pd.DataFrame, 26 | label_mapping: t.Dict[int, str] | None = None, 27 | ): 28 | data_container = st.container() 29 | col1, col2 = data_container.columns([0.65, 0.35], gap="medium") 30 | with col1: 31 | st.markdown( 32 | f"

X_{label}

", unsafe_allow_html=True 33 | ) 34 | st.dataframe(data=X_data, use_container_width=True) 35 | 36 | with col2: 37 | st.markdown( 38 | f"

y_{label}

", unsafe_allow_html=True 39 | ) 40 | st.dataframe( 41 | data=y_data.map(label_mapping or (lambda x: x)), use_container_width=True 42 | ) 43 | 44 | describe_container = st.expander("Data statistics").container() 45 | col1, col2 = describe_container.columns([0.65, 0.35], gap="medium") 46 | with col1: 47 | st.dataframe(X_data.describe(), use_container_width=True) 48 | with col2: 49 | if label_mapping: 50 | st.dataframe( 51 | pd.concat( 52 | [ 53 | y_data.map(label_mapping).value_counts().sort_index(), 54 | y_data.map(label_mapping) 55 | .value_counts(normalize=True) 56 | .sort_index(), 57 | ], 58 | axis=1, 59 | ).round(3) 60 | ) 61 | else: 62 | st.dataframe(y_data.describe(), use_container_width=True) 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://data-science-superapp.streamlit.app) 3 | 4 | 5 | ## Prerequisites 6 | 7 | **Poetry**: If [Poetry](https://python-poetry.org/) is not installed, you can do so using pip: 8 | 9 | 10 | ```bash 11 | pip install poetry 12 | ``` 13 | 14 | **Docker**: If [Docker](https://www.docker.com/) is not installed, you can do so following [this link](https://docs.docker.com/get-docker/) 15 | 16 | ## Installation 17 | 18 | 1. Clone the repository: 19 | 20 | ```bash 21 | git clone https://github.com/daltunay/my-superapp.git 22 | cd my-superapp 23 | ``` 24 | 25 | 2. Set up the project dependencies using Poetry: 26 | 27 | ```bash 28 | poetry install 29 | ``` 30 | 31 | This command will create a virtual environment and install the necessary dependencies. 32 | 33 | ## Setting up API Keys 34 | 35 | The application uses several APIs to function properly. 36 | You can specifiy the API keys in `.streamlit/secrets.toml`: 37 | 38 | ```toml 39 | [twilio] 40 | TWILIO_ACCOUNT_SID = "<...>" 41 | TWILIO_AUTH_TOKEN = "<...>" 42 | 43 | [openai] 44 | OPENAI_API_KEY = "<...>" 45 | 46 | [together] 47 | TOGETHER_API_KEY = "<...>" 48 | 49 | [lakera_guard] 50 | LAKERA_GUARD_API_KEY = "<...>" 51 | 52 | [google] 53 | GOOGLE_API_KEY = "<...>" 54 | GOOGLE_CSE_ID = "<...>" 55 | ``` 56 | 57 | 58 | ## Running the Application 59 | The _my-superapp_ application can be run using either Poetry or Docker. 60 | 61 | ### Using Poetry 62 | 63 | To run the application using Poetry: 64 | 65 | ```bash 66 | poetry run streamlit run app.py 67 | ``` 68 | 69 | ### Using Docker 70 | 71 | 1. Build the Docker image: 72 | 73 | ```bash 74 | docker build -t my-superapp . 75 | ``` 76 | 77 | 2. Run the application as a Docker container: 78 | 79 | ```bash 80 | docker run -p 8501:8501 my-superapp 81 | ``` 82 | 83 | Alternatively, you can just run the following: 84 | 85 | ```bash 86 | chmod +x ./bin/run.sh 87 | ./bin/run.sh 88 | ``` 89 | 90 | Once the application is running, it will be accessible at http://localhost:8501 in your web browser. 
91 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot_web_summary.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.chains.combine_documents.base import BaseCombineDocumentsChain 5 | from langchain.chains.summarize import load_summarize_chain 6 | from langchain.docstore.document import Document 7 | from langchain.document_loaders import UnstructuredURLLoader 8 | from unstructured.cleaners.core import (clean, clean_extra_whitespace, 9 | remove_punctuation) 10 | 11 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs 12 | 13 | 14 | class ChatbotWebSummary(Chatbot): 15 | available_chain_types = ["stuff", "map_reduce"] 16 | 17 | def __init__( 18 | self, 19 | chain_type: t.Literal["stuff"] | t.Literal["map_reduce"] = "stuff", 20 | **model_kwargs: t.Unpack[ModelArgs], 21 | ) -> None: 22 | super().__init__(**model_kwargs) 23 | self.chain_type = chain_type 24 | 25 | @staticmethod 26 | def url_to_doc(source_url: str) -> Document: 27 | url_loader = UnstructuredURLLoader( 28 | urls=[source_url], 29 | mode="elements", 30 | post_processors=[clean, remove_punctuation, clean_extra_whitespace], 31 | ) 32 | 33 | narrative_elements = [ 34 | element 35 | for element in url_loader.load() 36 | if element.metadata.get("category") == "NarrativeText" 37 | ] 38 | cleaned_content = " ".join( 39 | element.page_content for element in narrative_elements 40 | ) 41 | 42 | return Document(page_content=cleaned_content, metadata={"source": source_url}) 43 | 44 | @cached_property 45 | def chain(self) -> BaseCombineDocumentsChain: 46 | return load_summarize_chain(self.llm, chain_type=self.chain_type, verbose=True) 47 | 48 | def summarize(self, url: str) -> str: 49 | document = self.url_to_doc(url) 50 | return self.chain.run( 51 | [document], 52 | callbacks=self.callbacks, 53 | ) 54 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/pca.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.datasets import Dataset 5 | from src.statistics.dimensionality_reduction import PCAManager 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/statistics/dimensionality_rediction/pca_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("PCA") 36 | with st.container(border=True): 37 | pca_manager = PCAManager(max_n_components=3) 38 | pca_manager.set_model() 39 | 40 | pca_manager.fit(data=X, target_col=y) 41 | 42 | st.subheader("Scatter matrix plot", divider="gray") 43 | st.plotly_chart(pca_manager.scatter_matrix_plot(), 
use_container_width=True) 44 | 45 | st.subheader("Explained variance plot", divider="gray") 46 | st.plotly_chart(pca_manager.explained_variance_plot(), use_container_width=True) 47 | 48 | st.subheader("Scatter 2D + Loadings plot", divider="gray") 49 | try: 50 | st.plotly_chart(pca_manager.loadings_plot(), use_container_width=True) 51 | except ValueError: 52 | st.error("Number of principal components not sufficient for the plot") 53 | 54 | st.subheader("Scatter 3D plot", divider="gray") 55 | try: 56 | st.plotly_chart(pca_manager.scatter_3d_plot(), use_container_width=True) 57 | except ValueError: 58 | st.error("Number of principal components not sufficient for the plot") 59 | -------------------------------------------------------------------------------- /pages/regression/xgboost.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning import XGBoostManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/machine_learning/xgboost_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type="regression") 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=True) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X_train, X_test = dataset.X 28 | y_train, y_test = dataset.y 29 | label_mapping = dataset.label_mapping 30 | 31 | st.subheader("Visualize data") 32 | train_tab, test_tab = st.tabs(tabs=["Train", "Test"]) 33 | with train_tab: 34 | with st.container(border=True): 35 | utils.display_tab_content("train", X_train, y_train, label_mapping) 36 | with test_tab: 37 | with st.container(border=True): 38 | utils.display_tab_content("test", X_test, y_test, label_mapping) 39 | 40 | st.header("Regression", divider="gray") 41 | st.markdown( 42 | "Regression model: `XGBRegressor` from `xgboost` " 43 | "([official documentation](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor))" 44 | ) 45 | regression_manager = XGBoostManager(task="regression") 46 | 47 | st.subheader("Hyperparameters") 48 | with st.container(border=True): 49 | regression_manager.set_model() 50 | 51 | st.subheader("Evaluation") 52 | regression_manager.fit(X_train, y_train) 53 | regression_manager.evaluate(X_test, y_test) 54 | st.markdown("Metrics Report") 55 | st.columns([0.5, 1, 0.5])[1].dataframe( 56 | data=regression_manager.metrics_report.round(2), use_container_width=True 57 | ) 58 | st.subheader("Explainability") 59 | st.markdown("SHAP force plot") 60 | utils.st_shap(plot=regression_manager.shap_force_plot(X_test), height=400) 61 | -------------------------------------------------------------------------------- /pages/clustering/kmeans.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.clustering import KMeansManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | 
utils.show_source_code("src/machine_learning/clustering/kmeans_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("K-Means") 36 | with st.container(border=True): 37 | kmeans_manager = KMeansManager(max_n_clusters=10) 38 | kmeans_manager.set_model() 39 | 40 | kmeans_manager.fit(data=X) 41 | 42 | st.subheader("Scatter plot", divider="gray") 43 | col_x, col_y = st.columns(2) 44 | x_col_scatter = col_x.selectbox( 45 | label="X column", key="scatter_x", options=X.columns, index=0 46 | ) 47 | y_col_scatter = col_y.selectbox( 48 | label="Y column", key="scatter_y", options=X.columns, index=1 49 | ) 50 | st.plotly_chart( 51 | kmeans_manager.scatter_plot(x_col_scatter, y_col_scatter), 52 | use_container_width=True, 53 | ) 54 | 55 | st.subheader("Centroids plot", divider="gray") 56 | col_x, col_y = st.columns(2) 57 | x_col_centroids = col_x.selectbox( 58 | label="X column", key="centroids_x", options=X.columns, index=0 59 | ) 60 | y_col_centroids = col_y.selectbox( 61 | label="Y column", key="centroids_y", options=X.columns, index=1 62 | ) 63 | st.plotly_chart( 64 | kmeans_manager.centroids_plot(x_col_centroids, y_col_centroids), 65 | use_container_width=True, 66 | ) 67 | -------------------------------------------------------------------------------- /src/machine_learning/clustering/dbscan_manager.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import plotly.express as px 3 | import streamlit as st 4 | from sklearn.cluster import DBSCAN 5 | 6 | 7 | class DBScanManager: 8 | def __init__(self): 9 | self.model: DBSCAN | None = None 10 | 11 | @property 12 | def params(self) -> dict: 13 | columns = st.columns(2) 14 | return { 15 | "eps": columns[0].slider( 16 | label="Maximum Distance (eps)", 17 | min_value=0.1, 18 | max_value=5.0, 19 | value=1.0, 20 | step=0.1, 21 | help="Maximum distance between two samples for one to be considered as in the neighborhood of the other.", 22 | ), 23 | "min_samples": columns[1].slider( 24 | label="Minimum Samples", 25 | min_value=1, 26 | max_value=10, 27 | value=5, 28 | step=1, 29 | help="The number of samples in a neighborhood for a point to be considered as a core point.", 30 | ), 31 | } 32 | 33 | @staticmethod 34 | @st.cache_resource(show_spinner=True) 35 | def _get_model(eps: float, min_samples: int) -> DBSCAN: 36 | return DBSCAN(eps=eps, min_samples=min_samples) 37 | 38 | def set_model(self) -> None: 39 | self.model = self._get_model(**self.params) 40 | 41 | @staticmethod 42 | @st.cache_resource( 43 | show_spinner=True, 44 | hash_funcs={DBSCAN: lambda model: (model.eps, model.min_samples)}, 45 | ) 46 | def _perform_clustering(model: DBSCAN, data: pd.DataFrame) -> pd.DataFrame: 47 | clusters = model.fit_predict(data) 48 | data = data.assign(Cluster=clusters) 49 | data["Cluster"] = data["Cluster"].astype(str) 50 | return model, data 51 | 52 | def fit(self, data: pd.DataFrame): 53 | self.model, self.data_clustered = self._perform_clustering( 54 | model=self.model, data=data 55 | ) 56 | 57 | def
scatter_plot(self, x_col: str, y_col: str) -> None: 58 | return px.scatter( 59 | self.data_clustered, 60 | x=x_col, 61 | y=y_col, 62 | color="Cluster", 63 | labels={"color": "Cluster"}, 64 | ) 65 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot_rag.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.chains import ConversationalRetrievalChain 5 | from langchain.chains.conversational_retrieval.base import \ 6 | BaseConversationalRetrievalChain 7 | from langchain.embeddings import OpenAIEmbeddings 8 | from langchain.vectorstores import FAISS 9 | from langchain.vectorstores.base import VectorStoreRetriever 10 | 11 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs 12 | 13 | 14 | class ChatbotRAG(Chatbot): 15 | def __init__( 16 | self, 17 | vector_store: FAISS | None = None, 18 | embeddings_kwargs: t.Dict | None = None, 19 | search_kwargs: t.Dict | None = None, 20 | **model_kwargs: t.Unpack[ModelArgs], 21 | ) -> None: 22 | super().__init__(**model_kwargs) 23 | if vector_store: 24 | self.vector_store = vector_store 25 | self.embeddings_kwargs = embeddings_kwargs or {} 26 | self.search_kwargs = search_kwargs or {} 27 | 28 | @cached_property 29 | def embeddings(self) -> OpenAIEmbeddings: 30 | return OpenAIEmbeddings(**self.embeddings_kwargs) 31 | 32 | @cached_property 33 | def vector_store(self) -> FAISS: 34 | return FAISS.load_local(folder_path="faiss_index", embeddings=self.embeddings) 35 | 36 | @cached_property 37 | def retriever(self) -> VectorStoreRetriever: 38 | return self.vector_store.as_retriever( 39 | search_type="similarity", 40 | search_kwargs=self.search_kwargs, 41 | ) 42 | 43 | @cached_property 44 | def chain(self) -> BaseConversationalRetrievalChain: 45 | return ConversationalRetrievalChain.from_llm( 46 | llm=self.llm, 47 | memory=self.memory, 48 | verbose=True, 49 | combine_docs_chain_kwargs={"prompt": self.template}, 50 | chain_type="stuff", 51 | retriever=self.retriever, 52 | ) 53 | 54 | def ask( 55 | self, 56 | query: str, 57 | language: str | None = None, 58 | ) -> str: 59 | return self.chain.run( 60 | question=query, 61 | language=language or "the input language", 62 | callbacks=self.callbacks, 63 | ) 64 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from pages.large_language_models import LLM_CONFIG 5 | from src.generative_ai.large_language_models import Chatbot 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | st_ss = st.session_state 11 | 12 | 13 | def main(): 14 | utils.show_source_code("src/generative_ai/large_language_models/chatbots/chatbot.py") 15 | with st.expander(label="Chat parameters", expanded=True): 16 | col1, col2 = st.columns(2) 17 | with col1: 18 | selected_language = st_ss.setdefault( 19 | "language_widget", utils.LanguageWidget() 20 | ).selected_language 21 | with col2: 22 | lakera_activated = st_ss.setdefault( 23 | "lakera_widget", utils.LakeraWidget() 24 | ).lakera_activated 25 | 26 | chosen_model = st.selectbox( 27 | label="Large Language Model:", 28 | placeholder="Choose an option", 29 | options=LLM_CONFIG.keys(), 30 | index=0, 31 | 
on_change=utils.reset_session_state_key, 32 | kwargs={"key": "chatbot"}, 33 | ) 34 | 35 | provided_context = st.text_area( 36 | label="Context:", 37 | value="", 38 | help="This context will be passed to the chatbot.", 39 | ) 40 | 41 | if chosen_model: 42 | chatbot = st_ss.setdefault("chatbot", Chatbot(**LLM_CONFIG[chosen_model])) 43 | for message in chatbot.history: 44 | st.chat_message(message["role"]).write(message["content"]) 45 | else: 46 | pass 47 | 48 | if prompt := st.chat_input( 49 | placeholder=f"Chat with {chosen_model}!" if chosen_model else "", 50 | disabled=not chosen_model, 51 | ): 52 | st.chat_message("human").write(prompt) 53 | if lakera_activated: 54 | flag, response = st_ss.setdefault( 55 | "lakera_widget", utils.LakeraWidget() 56 | ).flag_prompt(prompt=prompt) 57 | if flag: 58 | st.warning(body="Prompt injection detected", icon="🚨") 59 | st.expander(label="LOGS").json(response) 60 | with st.chat_message("ai"): 61 | chatbot.ask( 62 | query=prompt, 63 | context=provided_context, 64 | language=selected_language, 65 | ) 66 | -------------------------------------------------------------------------------- /pages/statistical_tests/ab_test.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.statistics.statistical_tests import ABTesting, input_group_data 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | 10 | def main(): 11 | st.header("Data", divider="gray") 12 | a_col, b_col = st.columns(2, gap="small") 13 | with a_col.container(border=True): 14 | st.subheader("Group A") 15 | a_visitors, a_conversions, a_rate = input_group_data( 16 | group_name="A", default_visitors=1000, default_conversions=50 17 | ) 18 | with b_col.container(border=True): 19 | st.subheader("Group B") 20 | b_visitors, b_conversions, b_rate = input_group_data( 21 | group_name="B", default_visitors=200, default_conversions=35 22 | ) 23 | 24 | st.header("Settings", divider="gray") 25 | settings_container = st.container(border=True) 26 | test_type = settings_container.selectbox( 27 | label="Test type", 28 | key="ab_test.test_type", 29 | options=["one-sided", "two-sided"], 30 | index=1, 31 | format_func=lambda x: x.replace("-", " ").capitalize(), 32 | ) 33 | confidence_col, alpha_col = settings_container.columns(2) 34 | confidence = confidence_col.columns([0.15, 1, 0.15])[1].select_slider( 35 | "Confidence level", 36 | options=[0.9, 0.95, 0.99], 37 | value=0.95, 38 | key="ab_test.confidence", 39 | format_func=lambda x: f"{100*x}%", 40 | on_change=utils.update_slider_callback, 41 | kwargs={"updated": "ab_test.confidence", "to_update": "ab_test.alpha"}, 42 | ) 43 | alpha = alpha_col.columns([0.15, 1, 0.15])[1].select_slider( 44 | "Alpha value", 45 | options=[0.01, 0.05, 0.1], 46 | value=0.05, 47 | key="ab_test.alpha", 48 | format_func=lambda x: f"{100*x}%", 49 | on_change=utils.update_slider_callback, 50 | kwargs={"updated": "ab_test.alpha", "to_update": "ab_test.confidence"}, 51 | ) 52 | 53 | ab_testing = ABTesting(a_visitors, a_rate, b_visitors, b_rate, alpha, test_type) 54 | 55 | st.header("Results", divider="gray") 56 | result = ab_testing.perform_ab_test() 57 | 58 | if result["is_significant"]: 59 | st.success("The difference is significant", icon="✅") 60 | else: 61 | st.error("The difference is not significant", icon="❌") 62 | 63 | st.expander(label="Test details").json(result) 64 | -------------------------------------------------------------------------------- 
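The A/B test page above delegates the statistics to the `ABTesting` class (defined in `src/statistics/statistical_tests/ab_test.py`, shown further down). As a rough, self-contained sketch of the same kind of two-proportion significance check — illustrative only, using the page's default group sizes and plain `scipy` rather than the app's own class:

```python
from scipy import stats

# Default figures from the A/B test page above (hypothetical data)
a_visitors, a_conversions = 1000, 50
b_visitors, b_conversions = 200, 35
a_rate, b_rate = a_conversions / a_visitors, b_conversions / b_visitors

# Per-visitor standard deviation of a 0/1 conversion outcome: sqrt(p * (1 - p))
a_std = (a_rate * (1 - a_rate)) ** 0.5
b_std = (b_rate * (1 - b_rate)) ** 0.5

# Two-sided t-test from summary statistics
t_statistic, p_value = stats.ttest_ind_from_stats(
    mean1=a_rate, std1=a_std, nobs1=a_visitors,
    mean2=b_rate, std2=b_std, nobs2=b_visitors,
)
print(f"t = {t_statistic:.3f}, p = {p_value:.4f}, significant at alpha=0.05: {p_value < 0.05}")
```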
/pages/classification/xgboost.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning import XGBoostManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/machine_learning/xgboost_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type="classification") 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=True) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X_train, X_test = dataset.X 28 | y_train, y_test = dataset.y 29 | label_mapping = dataset.label_mapping 30 | 31 | st.subheader("Visualize data") 32 | train_tab, test_tab = st.tabs(tabs=["Train", "Test"]) 33 | with train_tab: 34 | with st.container(border=True): 35 | utils.display_tab_content("train", X_train, y_train, label_mapping) 36 | with test_tab: 37 | with st.container(border=True): 38 | utils.display_tab_content("test", X_test, y_test, label_mapping) 39 | 40 | st.header("Classification", divider="gray") 41 | st.markdown( 42 | "Classification model: `XGBClassifier` from `xgboost` " 43 | "([official documentation](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier))" 44 | ) 45 | classification_manager = XGBoostManager(task="classification") 46 | 47 | st.subheader("Hyperparameters") 48 | with st.container(border=True): 49 | classification_manager.set_model(label_mapping=label_mapping) 50 | 51 | st.subheader("Evaluation") 52 | classification_manager.fit(X_train, y_train) 53 | classification_manager.evaluate( 54 | X_test, y_test, target_names=list(label_mapping.values()) 55 | ) 56 | st.markdown("Classification Report") 57 | st.columns(3)[1].dataframe( 58 | data=classification_manager.classification_report, use_container_width=True 59 | ) 60 | st.markdown("Confusion Matrix") 61 | st.columns([0.1, 1, 0.1])[1].pyplot( 62 | fig=classification_manager.confusion_matrix_display( 63 | display_labels=list(label_mapping.values()) 64 | ) 65 | ) 66 | st.subheader("Explainability") 67 | st.markdown("SHAP force plot") 68 | utils.st_shap(plot=classification_manager.shap_force_plot(X_test), height=400) 69 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot_tools.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from pages.large_language_models import LLM_CONFIG 5 | from src.generative_ai.large_language_models import ChatbotTools 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | st_ss = st.session_state 11 | 12 | 13 | def main(): 14 | utils.show_source_code( 15 | "src/generative_ai/large_language_models/chatbots/chatbot_tools.py" 16 | ) 17 | with st.expander(label="Chat parameters", expanded=True): 18 | col1, col2 = st.columns(2) 19 | with col1: 20 | selected_language = st_ss.setdefault( 21 | "language_widget", utils.LanguageWidget() 22 | ).selected_language 23 | with col2: 24 | lakera_activated = st_ss.setdefault( 25 | "lakera_widget", utils.LakeraWidget() 26 | ).lakera_activated 27 | 28 | chosen_model = st.selectbox( 
29 | label="Large Language Model:", 30 | placeholder="Choose an option", 31 | options=LLM_CONFIG.keys(), 32 | index=0, 33 | on_change=utils.reset_session_state_key, 34 | kwargs={"key": "chatbot_tools"}, 35 | ) 36 | 37 | chosen_tools = st.multiselect( 38 | label="Tools:", 39 | options=ChatbotTools.available_tools, 40 | default=None, 41 | on_change=utils.reset_session_state_key, 42 | kwargs={"key": "chatbot_tools"}, 43 | ) 44 | 45 | if chosen_model and chosen_tools: 46 | chatbot = st_ss.setdefault( 47 | "chatbot_tools", 48 | ChatbotTools(**LLM_CONFIG[chosen_model], tool_names=chosen_tools), 49 | ) 50 | for message in chatbot.history: 51 | st.chat_message(message["role"]).write(message["content"]) 52 | else: 53 | st.info("Choose tools for the LLM", icon="ℹ️") 54 | 55 | if prompt := st.chat_input( 56 | placeholder=f"Chat with {chosen_model}!" 57 | if (chosen_model and chosen_tools) 58 | else "", 59 | disabled=not (chosen_model and chosen_tools), 60 | ): 61 | st.chat_message("human").write(prompt) 62 | if lakera_activated: 63 | flag, response = st_ss.setdefault( 64 | "lakera_widget", utils.LakeraWidget() 65 | ).flag_prompt(prompt=prompt) 66 | if flag: 67 | st.warning(body="Prompt injection detected", icon="🚨") 68 | st.expander(label="LOGS").json(response) 69 | with st.chat_message("ai"): 70 | st.write(chatbot.ask( 71 | query=prompt, 72 | language=selected_language, 73 | )) 74 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot_tools.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.agents import (AgentExecutor, AgentType, initialize_agent, 5 | load_tools) 6 | from langchain.callbacks.base import BaseCallbackHandler 7 | from langchain.tools import BaseTool 8 | 9 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs 10 | 11 | 12 | class ChatbotTools(Chatbot): 13 | available_tools = ["google-search", "arxiv", "wikipedia", "stackexchange", "human"] 14 | 15 | def __init__( 16 | self, 17 | tool_names: t.List[str] | None = None, 18 | **model_kwargs: t.Unpack[ModelArgs], 19 | ) -> None: 20 | super().__init__(**model_kwargs) 21 | self.tool_names = tool_names or [] 22 | self.memory.input_key = "input" 23 | 24 | # @property 25 | # def callbacks(self) -> t.List[BaseCallbackHandler]: 26 | # return [super().callbacks[1]] 27 | 28 | @cached_property 29 | def tools(self) -> t.List[BaseTool]: 30 | return load_tools(tool_names=self.tool_names) 31 | 32 | @staticmethod 33 | def update_agent_prompt_template( 34 | agent: AgentExecutor, 35 | text: str, 36 | input_variable: str | None = None, 37 | ): 38 | template = agent.agent.llm_chain.prompt.template 39 | newline_index = agent.agent.llm_chain.prompt.template.find("\n\n") 40 | agent.agent.llm_chain.prompt.template = text + template[newline_index:] 41 | if input_variable: 42 | agent.agent.llm_chain.prompt.input_variables.append(input_variable) 43 | return agent 44 | 45 | @cached_property 46 | def chain(self) -> AgentExecutor: 47 | agent = initialize_agent( 48 | llm=self.llm, 49 | memory=self.memory, 50 | verbose=True, 51 | agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION, 52 | agent_kwargs={ 53 | "input_variables": [ 54 | "input", 55 | "chat_history", 56 | "agent_scratchpad", 57 | "language", 58 | ] 59 | }, 60 | tools=self.tools, 61 | handle_parsing_errors=True, 62 | return_intermediate_steps=False, 63 | ) 64 | agent = 
self.update_agent_prompt_template( 65 | agent=agent, 66 | text="Assistant is a large language model, speaking in {language}.", 67 | input_variable="language", 68 | ) 69 | return agent 70 | 71 | def ask( 72 | self, 73 | query: str, 74 | language: str | None = None, 75 | ) -> str: 76 | return self.chain.run( 77 | input=query, 78 | language=language or "the input language", 79 | callbacks=self.callbacks, 80 | ) 81 | -------------------------------------------------------------------------------- /src/machine_learning/clustering/kmeans_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | import streamlit as st 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | class KMeansManager: 11 | def __init__(self, max_n_clusters: int): 12 | self.max_n_clusters = max_n_clusters 13 | self.model: KMeans | None = None 14 | 15 | @property 16 | def params(self) -> t.Dict[str, int]: 17 | columns = st.columns(2) 18 | return { 19 | "n_clusters": columns[0].slider( 20 | label="Number of Clusters", 21 | min_value=1, 22 | max_value=self.max_n_clusters, 23 | value=2, 24 | step=1, 25 | help="Number of clusters to form.", 26 | ), 27 | } 28 | 29 | @staticmethod 30 | @st.cache_resource(show_spinner=True) 31 | def _get_model(n_clusters: int) -> KMeans: 32 | return KMeans(n_clusters=n_clusters, n_init="auto") 33 | 34 | def set_model(self) -> None: 35 | params = self.params 36 | self.model = self._get_model(params["n_clusters"]) 37 | 38 | @staticmethod 39 | @st.cache_resource( 40 | show_spinner=True, 41 | hash_funcs={KMeans: lambda model: model.n_clusters}, 42 | ) 43 | def _perform_clustering(model: KMeans, data: pd.DataFrame) -> pd.DataFrame: 44 | model = model.fit(data) 45 | clusters = model.predict(data) 46 | data = data.assign(Cluster=clusters) 47 | data["Cluster"] = data["Cluster"].astype(str) 48 | return model, data 49 | 50 | def fit(self, data: pd.DataFrame): 51 | self.model, self.data_clustered = self._perform_clustering( 52 | model=self.model, data=data 53 | ) 54 | 55 | def scatter_plot(self, x_col: str, y_col: str) -> None: 56 | return px.scatter( 57 | self.data_clustered, 58 | x=x_col, 59 | y=y_col, 60 | color="Cluster", 61 | labels={"color": "Cluster"}, 62 | ) 63 | 64 | def centroids_plot(self, x_col: str, y_col: str) -> None: 65 | centroids = pd.DataFrame( 66 | self.model.cluster_centers_, 67 | columns=[f"{col}_centroid" for col in self.data_clustered.columns[:-1]], 68 | ) 69 | centroids[x_col] = centroids[f"{x_col}_centroid"] 70 | centroids[y_col] = centroids[f"{y_col}_centroid"] 71 | 72 | fig = px.scatter( 73 | self.data_clustered, 74 | x=x_col, 75 | y=y_col, 76 | color="Cluster", 77 | labels={"color": "Cluster"}, 78 | ) 79 | 80 | fig.add_trace( 81 | go.Scatter( 82 | x=centroids[x_col], 83 | y=centroids[y_col], 84 | mode="markers", 85 | marker=dict(size=20, symbol="x", color="white"), 86 | name="Centroids", 87 | ) 88 | ) 89 | return fig 90 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot_rag.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from pages.large_language_models import LLM_CONFIG 5 | from src.generative_ai.large_language_models import (ChatbotRAG, 6 | get_vector_store) 7 | 8 | loader = utils.PageConfigLoader(__file__) 9 | loader.set_page_config(globals()) 10 | 11 | st_ss = 
st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code( 16 | path="src/generative_ai/large_language_models/chatbots/chatbot_rag.py" 17 | ) 18 | with st.expander(label="Chat parameters", expanded=True): 19 | col1, col2 = st.columns(2) 20 | with col1: 21 | selected_language = st_ss.setdefault( 22 | "language_widget", utils.LanguageWidget() 23 | ).selected_language 24 | with col2: 25 | lakera_activated = st_ss.setdefault( 26 | "lakera_widget", utils.LakeraWidget() 27 | ).lakera_activated 28 | 29 | chosen_model = st.selectbox( 30 | label="Large Language Model:", 31 | placeholder="Choose an option", 32 | options=LLM_CONFIG.keys(), 33 | index=0, 34 | on_change=utils.reset_session_state_key, 35 | kwargs={"key": "chatbot_rag"}, 36 | ) 37 | 38 | if uploaded_file := st.file_uploader( 39 | "Upload a PDF file", 40 | type="pdf", 41 | accept_multiple_files=False, 42 | help="https://python.langchain.com/docs/use_cases/question_answering/#what-is-rag", 43 | on_change=utils.reset_session_state_key, 44 | kwargs={"key": "chatbot_rag"}, 45 | ): 46 | with open(uploaded_file.name, "wb") as f: 47 | f.write(uploaded_file.getbuffer()) 48 | vector_db = get_vector_store(file=uploaded_file.name, mode="upload") 49 | 50 | if chosen_model and uploaded_file: 51 | chatbot = st_ss.setdefault( 52 | "chatbot_rag", 53 | ChatbotRAG(vector_store=vector_db, **LLM_CONFIG[chosen_model]), 54 | ) 55 | for message in chatbot.history: 56 | st.chat_message(message["role"]).write(message["content"]) 57 | else: 58 | st.info("Please upload a PDF file for the RAG", icon="ℹ️") 59 | 60 | if prompt := st.chat_input( 61 | placeholder=f"Chat with {chosen_model}!" 62 | if (chosen_model and uploaded_file) 63 | else "", 64 | disabled=not (chosen_model and uploaded_file), 65 | ): 66 | st.chat_message("human").write(prompt) 67 | if lakera_activated: 68 | flag, response = st_ss.setdefault( 69 | "lakera_widget", utils.LakeraWidget() 70 | ).flag_prompt(prompt=prompt) 71 | if flag: 72 | st.warning(body="Prompt injection detected", icon="🚨") 73 | st.expander(label="LOGS").json(response) 74 | with st.chat_message("ai"): 75 | chatbot.ask( 76 | query=prompt, 77 | language=selected_language, 78 | ) 79 | -------------------------------------------------------------------------------- /src/computer_vision/object_detection/face_detection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from functools import cached_property 4 | 5 | import cv2 6 | import mediapipe as mp 7 | import streamlit_webrtc as st_webrtc 8 | from av import VideoFrame 9 | from mediapipe.framework.formats import detection_pb2 10 | from numpy import ndarray 11 | 12 | import utils 13 | 14 | logger = utils.CustomLogger(__file__) 15 | 16 | os.environ["MEDIAPIPE_DISABLE_GPU"] = "1" 17 | 18 | 19 | class FaceDetectionApp: 20 | def __init__(self): 21 | pass 22 | 23 | @cached_property 24 | def detector(self): 25 | return mp.solutions.face_detection.FaceDetection( 26 | min_detection_confidence=0.5, 27 | model_selection=0, 28 | ) 29 | 30 | def detect_faces(self, image: ndarray) -> t.Any: 31 | return self.detector.process(image).detections 32 | 33 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame: 34 | image = frame.to_ndarray(format="bgr24") 35 | 36 | detection_list = self.detect_faces(image) 37 | self.annotate_faces( 38 | image=image, 39 | detection_list=detection_list, 40 | ) 41 | utils.annotate_time(image=image) 42 | return VideoFrame.from_ndarray(image, format="bgr24") 43 | 44 | def 
stream(self) -> None: 45 | st_webrtc.webrtc_streamer( 46 | video_frame_callback=self.video_frame_callback, 47 | key="face_streamer", 48 | mode=st_webrtc.WebRtcMode.SENDRECV, 49 | rtc_configuration=st_webrtc.RTCConfiguration( 50 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"} 51 | ), 52 | media_stream_constraints={"video": True, "audio": False}, 53 | async_processing=True, 54 | desired_playing_state=None, 55 | ) 56 | 57 | @staticmethod 58 | def annotate_faces( 59 | image: ndarray, 60 | detection_list: t.List[detection_pb2.Detection], 61 | ) -> None: 62 | if not detection_list: 63 | return 64 | 65 | for detection in detection_list: 66 | score = detection.score[0] 67 | bbox = detection.location_data.relative_bounding_box 68 | height, width, _ = image.shape 69 | xmin, ymin = int(bbox.xmin * width), int(bbox.ymin * height) 70 | xmax, ymax = int((bbox.xmin + bbox.width) * width), int( 71 | (bbox.ymin + bbox.height) * height 72 | ) 73 | cv2.rectangle( 74 | img=image, 75 | pt1=(xmin, ymin), 76 | pt2=(xmax, ymax), 77 | color=(0, 255, 0), 78 | thickness=3, 79 | ) 80 | cv2.putText( 81 | img=image, 82 | text=f"score: {score:.3f}", 83 | org=(xmin, ymin - 10), 84 | fontFace=cv2.FONT_HERSHEY_SIMPLEX, 85 | fontScale=0.5, 86 | color=(0, 255, 0), 87 | thickness=2, 88 | ) 89 | -------------------------------------------------------------------------------- /pages/statistical_tests/chi2_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | 4 | import utils 5 | from src.statistics.statistical_tests import Chi2Testing 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | 11 | def main(): 12 | st.header("Data", divider="gray") 13 | observed_template = pd.DataFrame( 14 | data=[["Group A", 30, 20], ["Group B", 70, 80]], 15 | index=None, 16 | columns=["group", "category_1", "category_2"], 17 | ) 18 | col_df, col_sum = st.columns([0.8, 0.2]) 19 | with col_df: 20 | observed = st.data_editor( 21 | data=observed_template, 22 | hide_index=True, 23 | column_config={ 24 | "group": st.column_config.TextColumn( 25 | "Group", 26 | help="The name of the considered group.", 27 | ), 28 | "category_1": st.column_config.NumberColumn( 29 | "Category 1", 30 | min_value=1, 31 | required=True, 32 | help="The observed values for the category 1.", 33 | ), 34 | "category_2": st.column_config.NumberColumn( 35 | "Category 2", 36 | min_value=1, 37 | required=True, 38 | help="The observed values for the category 2.", 39 | ), 40 | }, 41 | disabled=False, 42 | use_container_width=True, 43 | ) 44 | st.info("Click on any cell to change its content.", icon="💡") 45 | with col_sum: 46 | total_col = observed.drop("group", axis=1).sum(axis=1).to_frame(name="Total") 47 | st.dataframe(total_col, hide_index=True, use_container_width=True) 48 | 49 | st.header("Settings", divider="gray") 50 | settings_container = st.container(border=True) 51 | confidence_col, alpha_col = settings_container.columns(2) 52 | confidence = confidence_col.columns([0.15, 1, 0.15])[1].select_slider( 53 | "Confidence level", 54 | options=[0.9, 0.95, 0.99], 55 | value=0.95, 56 | key="chi2_test.confidence", 57 | format_func=lambda x: f"{100*x}%", 58 | on_change=utils.update_slider_callback, 59 | kwargs={"updated": "chi2_test.confidence", "to_update": "chi2_test.alpha"}, 60 | ) 61 | alpha = alpha_col.columns([0.15, 1, 0.15])[1].select_slider( 62 | "Alpha value", 63 | options=[0.01, 0.05, 0.1], 64 | value=0.05, 65 | 
key="chi2_test.alpha", 66 | format_func=lambda x: f"{100*x}%", 67 | on_change=utils.update_slider_callback, 68 | kwargs={"updated": "chi2_test.alpha", "to_update": "chi2_test.confidence"}, 69 | ) 70 | 71 | chi2_testing = Chi2Testing(observed.drop("group", axis=1), alpha) 72 | 73 | st.header("Results", divider="gray") 74 | result = chi2_testing.perform_chi2_test() 75 | 76 | if result["is_significant"]: 77 | st.success("The difference is significant", icon="✅") 78 | else: 79 | st.error("The difference is not significant", icon="❌") 80 | 81 | st.expander(label="Test details").json(result) 82 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/umap_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import plotly.express as px 5 | import streamlit as st 6 | from umap import UMAP 7 | 8 | 9 | class UMAPManager: 10 | def __init__(self, max_n_components: int): 11 | self.max_n_components = max_n_components 12 | self.model: UMAP | None = None 13 | self.target_col: pd.Series | None = None 14 | self.embedded_data_df: pd.DataFrame | None = None 15 | 16 | @property 17 | def params(self) -> t.Dict[str, int | float]: 18 | columns = st.columns(3) 19 | return { 20 | "n_components": columns[0].slider( 21 | label="Number of Components", 22 | min_value=1, 23 | max_value=self.max_n_components, 24 | value=3, 25 | step=1, 26 | help="Number of components to compute.", 27 | ), 28 | "n_neighbors": columns[1].slider( 29 | label="Number of Neighbors", 30 | min_value=2, 31 | max_value=100, 32 | value=15, 33 | step=1, 34 | help="Size of local neighborhood used for manifold approximation.", 35 | ), 36 | "min_dist": columns[2].slider( 37 | label="Minimum Distance", 38 | min_value=0.1, 39 | max_value=1.0, 40 | value=0.5, 41 | step=0.1, 42 | help="Minimum distance between embedded points.", 43 | ), 44 | } 45 | 46 | @st.cache_resource(show_spinner=True) 47 | def _get_model(_self, params: t.Dict[str, int | float]) -> UMAP: 48 | return UMAP( 49 | n_components=params["n_components"], 50 | n_neighbors=params["n_neighbors"], 51 | min_dist=params["min_dist"], 52 | ) 53 | 54 | def set_model(self) -> None: 55 | params = self.params 56 | self.model = self._get_model(params) 57 | 58 | @st.cache_resource( 59 | show_spinner=True, 60 | hash_funcs={ 61 | UMAP: lambda model: (model.n_components, model.n_neighbors, model.min_dist) 62 | }, 63 | ) 64 | def _compute_umap(_self, model: UMAP, data: pd.DataFrame) -> pd.DataFrame: 65 | embedded_data = model.fit_transform(data) 66 | column_names = [f"D{i}" for i in range(1, model.n_components + 1)] 67 | return pd.DataFrame(embedded_data, columns=column_names) 68 | 69 | def fit(self, data: pd.DataFrame, target_col: pd.Series): 70 | self.embedded_data_df = self._compute_umap(model=self.model, data=data) 71 | self.target_col = target_col 72 | 73 | def scatter_matrix_plot(self) -> None: 74 | return px.scatter_matrix( 75 | self.embedded_data_df, color=self.target_col, labels={"color": "target"} 76 | ) 77 | 78 | def scatter_2d_plot(self) -> None: 79 | return px.scatter( 80 | self.embedded_data_df, 81 | x="D1", 82 | y="D2", 83 | color=self.target_col, 84 | labels={"color": "target"}, 85 | ) 86 | 87 | def scatter_3d_plot(self) -> None: 88 | return px.scatter_3d( 89 | self.embedded_data_df, 90 | x="D1", 91 | y="D2", 92 | z="D3", 93 | color=self.target_col, 94 | labels={"color": "target"}, 95 | ) 96 | 
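# Illustrative wiring, mirroring pages/dimensionality_reduction/umap.py shown above.
# The sliders built in `params` are Streamlit widgets, so this only runs inside a
# Streamlit page, not as a standalone script:
#
#     umap_manager = UMAPManager(max_n_components=3)
#     umap_manager.set_model()                # renders the sliders, builds the UMAP model
#     umap_manager.fit(data=X, target_col=y)  # embedding is cached via st.cache_resource
#     st.plotly_chart(umap_manager.scatter_matrix_plot(), use_container_width=True)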
-------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.callbacks import StreamingStdOutCallbackHandler 5 | from langchain.callbacks.base import BaseCallbackHandler 6 | from langchain.chains import LLMChain 7 | from langchain.chains.base import Chain 8 | from langchain.chat_models import ChatOpenAI 9 | from langchain.llms import Together 10 | from langchain.memory import ConversationBufferMemory 11 | from langchain.prompts import PromptTemplate 12 | 13 | from src.generative_ai.large_language_models.callbacks import \ 14 | StreamingChatCallbackHandler 15 | 16 | 17 | class ModelArgs(t.TypedDict): 18 | provider: t.Literal["openai", "together"] 19 | owner: t.Literal["mistralai", "togethercomputer"] | None 20 | string: t.Literal["gpt-3.5-turbo", "llama-2-7b-chat", "Mistral-7B-Instruct-v0.1"] 21 | 22 | 23 | class Chatbot: 24 | BASE_TEMPLATE = """ 25 | Use the following context and chat history to answer the question: 26 | 27 | Context: {context} 28 | Chat history: {chat_history} 29 | Question: {question} 30 | 31 | Your answer (in {language}): 32 | """ 33 | 34 | def __init__(self, **model_kwargs: t.Unpack[ModelArgs]) -> None: 35 | self.model_provider = model_kwargs.get("provider", "openai") 36 | self.model_owner = model_kwargs.get("owner", None) 37 | self.model_string = model_kwargs.get("string", "gpt-3.5-turbo") 38 | 39 | @cached_property 40 | def llm(self) -> ChatOpenAI | Together: 41 | if self.model_provider == "openai": 42 | return ChatOpenAI( 43 | model=self.model_string, 44 | streaming=True, 45 | model_kwargs={}, 46 | ) 47 | elif self.model_provider == "together": 48 | return Together( 49 | model=f"{self.model_owner}/{self.model_string}", 50 | max_tokens=1024, 51 | ) 52 | 53 | @cached_property 54 | def memory(self) -> ConversationBufferMemory: 55 | return ConversationBufferMemory( 56 | memory_key="chat_history", 57 | input_key="question", 58 | return_messages=True, 59 | ) 60 | 61 | @property 62 | def history(self) -> t.List[t.Dict[str, str]]: 63 | return [ 64 | {"role": message.type, "content": message.content} 65 | for message in self.memory.buffer 66 | ] 67 | 68 | @cached_property 69 | def template(self) -> PromptTemplate: 70 | return PromptTemplate( 71 | template=self.BASE_TEMPLATE, 72 | input_variables=["context", "chat_history", "question", "language"], 73 | ) 74 | 75 | @cached_property 76 | def chain(self) -> Chain: 77 | return LLMChain( 78 | llm=self.llm, 79 | memory=self.memory, 80 | verbose=True, 81 | prompt=self.template, 82 | ) 83 | 84 | @property 85 | def callbacks(self) -> t.List[BaseCallbackHandler]: 86 | return [StreamingChatCallbackHandler(), StreamingStdOutCallbackHandler()] 87 | 88 | def ask( 89 | self, 90 | query: str, 91 | context: str | None = None, 92 | language: str | None = None, 93 | ) -> str: 94 | return self.chain.run( 95 | question=query, 96 | context=context or "", 97 | language=language or "the input language", 98 | callbacks=self.callbacks, 99 | ) 100 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/tsne_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import plotly.express as px 5 | import streamlit as st 6 | from sklearn.manifold import 
TSNE 7 | 8 | 9 | class TSNEManager: 10 | def __init__(self, max_n_components: int): 11 | self.max_n_components = max_n_components 12 | self.model: TSNE | None = None 13 | self.target_col: pd.Series | None = None 14 | self.embedded_data_df: pd.DataFrame | None = None 15 | 16 | @property 17 | def params(self) -> t.Dict[str, int]: 18 | columns = st.columns(3) 19 | return { 20 | "n_components": columns[0].slider( 21 | label="Number of Components", 22 | min_value=1, 23 | max_value=self.max_n_components, 24 | value=3, 25 | step=1, 26 | help="Number of components to compute.", 27 | ), 28 | "perplexity": columns[1].slider( 29 | label="Perplexity", 30 | min_value=1, 31 | max_value=100, 32 | value=30, 33 | step=1, 34 | help="A measure of how to balance attention between local and global aspects of the data.", 35 | ), 36 | "learning_rate": columns[2].slider( 37 | label="Learning Rate", 38 | min_value=10.0, 39 | max_value=500.0, 40 | value=200.0, 41 | step=50.0, 42 | help="Step size for each iteration in optimizing the cost function.", 43 | ), 44 | } 45 | 46 | @st.cache_resource(show_spinner=True) 47 | def _get_model(_self, params: t.Dict[str, int]) -> TSNE: 48 | return TSNE( 49 | n_components=params["n_components"], 50 | perplexity=params["perplexity"], 51 | learning_rate=params["learning_rate"], 52 | ) 53 | 54 | def set_model(self) -> None: 55 | params = self.params 56 | self.model = self._get_model(params) 57 | 58 | @st.cache_resource( 59 | show_spinner=True, 60 | hash_funcs={ 61 | TSNE: lambda model: ( 62 | model.n_components, 63 | model.perplexity, 64 | model.learning_rate, 65 | ) 66 | }, 67 | ) 68 | def _compute_tsne(_self, model: TSNE, data: pd.DataFrame) -> pd.DataFrame: 69 | embedded_data = model.fit_transform(data) 70 | column_names = [f"D{i}" for i in range(1, model.n_components + 1)] 71 | return pd.DataFrame(embedded_data, columns=column_names) 72 | 73 | def fit(self, data: pd.DataFrame, target_col: pd.Series): 74 | self.embedded_data_df = self._compute_tsne(model=self.model, data=data) 75 | self.target_col = target_col 76 | 77 | def scatter_matrix_plot(self) -> None: 78 | return px.scatter_matrix( 79 | self.embedded_data_df, color=self.target_col, labels={"color": "target"} 80 | ) 81 | 82 | def scatter_2d_plot(self) -> None: 83 | return px.scatter( 84 | self.embedded_data_df, 85 | x="D1", 86 | y="D2", 87 | color=self.target_col, 88 | labels={"color": "target"}, 89 | ) 90 | 91 | def scatter_3d_plot(self) -> None: 92 | return px.scatter_3d( 93 | self.embedded_data_df, 94 | x="D1", 95 | y="D2", 96 | z="D3", 97 | color=self.target_col, 98 | labels={"color": "target"}, 99 | ) 100 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from datetime import datetime 4 | 5 | import cv2 6 | import mediapipe as mp 7 | import streamlit_webrtc as st_webrtc 8 | from av import VideoFrame 9 | from mediapipe.framework.formats import landmark_pb2 10 | from numpy import ndarray 11 | 12 | import utils 13 | 14 | logger = utils.CustomLogger(__file__) 15 | 16 | os.environ["MEDIAPIPE_DISABLE_GPU"] = "1" 17 | 18 | 19 | class BaseLandmarkerApp: 20 | def __init__(self): 21 | pass 22 | 23 | def get_landmarks(self, image: ndarray) -> landmark_pb2.NormalizedLandmarkList: 24 | detection_result = self.landmarker.process(image) 25 | landmark_list = getattr(detection_result, self.landmarks_type) 26 | return landmark_list[0] if 
isinstance(landmark_list, list) else landmark_list 27 | 28 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame: 29 | image = frame.to_ndarray(format="bgr24") 30 | 31 | landmark_list = self.get_landmarks(image) 32 | self.annotate_landmarks( 33 | image=image, 34 | connections_list=self.connections_list, 35 | landmark_list=landmark_list, 36 | drawing_specs_list=self.drawing_specs_list, 37 | ) 38 | utils.annotate_time(image=image) 39 | return VideoFrame.from_ndarray(image, format="bgr24") 40 | 41 | def stream(self) -> None: 42 | st_webrtc.webrtc_streamer( 43 | video_frame_callback=self.video_frame_callback, 44 | key=f"{self.landmarks_type}_streamer", 45 | mode=st_webrtc.WebRtcMode.SENDRECV, 46 | rtc_configuration=st_webrtc.RTCConfiguration( 47 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"} 48 | ), 49 | media_stream_constraints={"video": True, "audio": False}, 50 | async_processing=True, 51 | desired_playing_state=None, 52 | ) 53 | 54 | @staticmethod 55 | def annotate_time(image: ndarray) -> None: 56 | text = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 57 | text_args = { 58 | "text": text, 59 | "fontFace": cv2.FONT_HERSHEY_SIMPLEX, 60 | "fontScale": 1, 61 | "thickness": 2, 62 | } 63 | text_size = cv2.getTextSize(**text_args)[0] 64 | rect_width, rect_height = text_size[0] + 20, text_size[1] + 20 65 | cv2.rectangle( 66 | img=image, 67 | pt1=(0, 0), 68 | pt2=(rect_width, rect_height), 69 | color=(255, 255, 255), 70 | thickness=cv2.FILLED, 71 | ) 72 | cv2.rectangle( 73 | img=image, 74 | pt1=(0, 0), 75 | pt2=(rect_width, rect_height), 76 | color=(0, 0, 0), 77 | thickness=2, 78 | ) 79 | cv2.putText( 80 | img=image, 81 | org=(10, text_size[1] + 10), 82 | color=(0, 0, 0), 83 | lineType=cv2.LINE_AA, 84 | **text_args, 85 | ) 86 | 87 | @staticmethod 88 | def annotate_landmarks( 89 | image: ndarray, 90 | connections_list: t.List[t.FrozenSet[t.Tuple[int, int]]], 91 | landmark_list: landmark_pb2.NormalizedLandmarkList, 92 | drawing_specs_list: t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]], 93 | ) -> None: 94 | if not landmark_list: 95 | return 96 | 97 | for connections, drawing_specs in zip(connections_list, drawing_specs_list): 98 | mp.solutions.drawing_utils.draw_landmarks( 99 | image=image, 100 | landmark_list=landmark_list, 101 | connections=connections, 102 | **drawing_specs, 103 | ) 104 | -------------------------------------------------------------------------------- /src/machine_learning/datasets.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | from sklearn import datasets 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | class DatasetParams(t.TypedDict): 10 | source: t.Literal["iris", "digits", "breast_cancer"] 11 | test_size: float | None 12 | shuffle: bool 13 | stratify: bool 14 | 15 | 16 | class Dataset: 17 | def __init__( 18 | self, 19 | type: t.Literal["classification", "regression"] | None = None, 20 | ): 21 | self.type = type 22 | self.X: t.Tuple[pd.DataFrame, pd.DataFrame] | None = None 23 | self.y: t.Tuple[pd.Series, pd.Series] | None = None 24 | self.label_mapping: t.Dict[int, str] | None = None 25 | self.description: str | None = None 26 | 27 | @property 28 | def params(self) -> t.Dict[str, t.Any]: 29 | columns = st.columns(3) 30 | return { 31 | "source": columns[0].selectbox( 32 | label="source", 33 | options=["iris", "digits", "breast_cancer"] 34 | if self.type == "classification" 35 | else 
["diabetes"] 36 | if self.type == "regression" 37 | else ["iris", "digits", "breast_cancer", "diabetes"], 38 | help="The scikit-learn toy dataset to use.", 39 | ), 40 | "test_size": columns[1].slider( 41 | "test_size", 42 | min_value=0.05, 43 | max_value=0.3, 44 | value=0.2, 45 | step=0.05, 46 | help="The proportion of the dataset to include in the test split", 47 | ) 48 | if self.type is not None 49 | else None, 50 | "shuffle": columns[2].checkbox( 51 | label="shuffle", 52 | value=True, 53 | help="Whether to shuffle the dataset or not.", 54 | ) 55 | if self.type is not None 56 | else None, 57 | "stratify": columns[2].checkbox( 58 | label="stratify", 59 | value=False, 60 | help="Whether to stratify the dataset or not. " 61 | "Stratifying means keeping the same label distribution in the initial, train and test datasets. " 62 | "Available for classification only.", 63 | disabled=self.type == "regression", 64 | ) 65 | if self.type is not None 66 | else None, 67 | } 68 | 69 | @staticmethod 70 | @st.cache_data(show_spinner=False) 71 | def get_dataset( 72 | split: bool = False, **params: t.Unpack[DatasetParams] 73 | ) -> t.Dict[str, t.Any]: 74 | raw_dataset = getattr(datasets, f"load_{params['source']}")(as_frame=True) 75 | X, y = raw_dataset.data, raw_dataset.target 76 | if split: 77 | X_train, X_test, y_train, y_test = train_test_split( 78 | X, 79 | y, 80 | test_size=params["test_size"], 81 | shuffle=params["shuffle"], 82 | stratify=y if params["stratify"] else None, 83 | random_state=0, 84 | ) 85 | X = X_train, X_test 86 | y = y_train, y_test 87 | return { 88 | "X": X, 89 | "y": y, 90 | "label_mapping": dict(enumerate(raw_dataset.target_names)) 91 | if "target_names" in raw_dataset 92 | else None, 93 | "description": raw_dataset.DESCR, 94 | } 95 | 96 | def set(self, raw_dataset_dict: t.Dict[str, t.Any]): 97 | self.X = raw_dataset_dict["X"] 98 | self.y = raw_dataset_dict["y"] 99 | self.label_mapping = raw_dataset_dict["label_mapping"] 100 | self.description = raw_dataset_dict["description"] 101 | -------------------------------------------------------------------------------- /src/statistics/statistical_tests/ab_test.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import streamlit as st 4 | from scipy import stats 5 | 6 | 7 | def input_group_data( 8 | group_name: t.Literal["A", "B"], 9 | default_visitors: int, 10 | default_conversions: int, 11 | ) -> t.Tuple[int, int, float]: 12 | visitors = st.number_input( 13 | "Visitors", 14 | key=f"ab_test.{group_name.lower()}_visitors", 15 | min_value=1, 16 | value=default_visitors, 17 | step=1, 18 | ) 19 | conversion_col, rate_col = st.columns(2) 20 | conversions = conversion_col.number_input( 21 | "Conversions", 22 | key=f"ab_test.{group_name.lower()}_conversions", 23 | min_value=0, 24 | max_value=visitors, 25 | value=default_conversions, 26 | step=1, 27 | ) 28 | rate = rate_col.number_input( 29 | "Conversion rate", 30 | key=f"ab_test.{group_name.lower()}_rate", 31 | min_value=0.0, 32 | max_value=1.0, 33 | value=conversions / visitors, 34 | disabled=True, 35 | ) 36 | return visitors, conversions, rate 37 | 38 | 39 | class ABTesting: 40 | def __init__( 41 | self, 42 | a_visitors: int, 43 | a_rate: float, 44 | b_visitors: int, 45 | b_rate: float, 46 | alpha: float, 47 | test_type: t.Literal["one-sided", "two-sided"], 48 | ): 49 | self.a_visitors, self.a_rate = a_visitors, a_rate 50 | self.b_visitors, self.b_rate = b_visitors, b_rate 51 | self.alpha = alpha 52 | self.test_type = test_type 
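    # The helpers below use the normal approximation for a conversion rate p
    # observed over n visitors: SE(p) = sqrt(p * (1 - p) / n), which is what
    # compute_standard_deviation() returns.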
53 | 54 | @staticmethod 55 | @st.cache_data(show_spinner=False) 56 | def compute_standard_deviation(rate: float, visitors: int) -> float: 57 | return (rate * (1 - rate) / visitors) ** 0.5 58 | 59 | @classmethod 60 | @st.cache_data(show_spinner=False) 61 | def compute_confidence_interval( 62 | _cls, 63 | a_rate: float, 64 | b_rate: float, 65 | a_visitors: int, 66 | b_visitors: int, 67 | alpha: float, 68 | ) -> t.Tuple[float, float]: 69 | a_std = _cls.compute_standard_deviation(a_rate, a_visitors) 70 | b_std = _cls.compute_standard_deviation(b_rate, b_visitors) 71 | interval = ( 72 | stats.norm.ppf(1 - alpha / 2) 73 | * ((a_std**2 / a_visitors) + (b_std**2 / b_visitors)) ** 0.5 74 | ) 75 | return b_rate - a_rate - interval, b_rate - a_rate + interval 76 | 77 | @staticmethod 78 | @st.cache_data(show_spinner=False) 79 | def is_statistically_significant(p_value: float, alpha: float) -> bool: 80 | return p_value < alpha 81 | 82 | @staticmethod 83 | @st.cache_data(show_spinner=False) 84 | def t_test(a_rate, a_std, a_visitors, b_rate, b_std, b_visitors, test_type): 85 | t_statistic, p_value = stats.ttest_ind_from_stats( 86 | mean1=a_rate, 87 | std1=a_std, 88 | nobs1=a_visitors, 89 | mean2=b_rate, 90 | std2=b_std, 91 | nobs2=b_visitors, 92 | ) 93 | if test_type == "one-sided": 94 | p_value /= 2 95 | 96 | return t_statistic, p_value 97 | 98 | def perform_ab_test(self) -> t.Dict[str, any]: 99 | a_std = self.compute_standard_deviation(self.a_rate, self.a_visitors) 100 | b_std = self.compute_standard_deviation(self.b_rate, self.b_visitors) 101 | 102 | t_statistic, p_value = self.t_test( 103 | self.a_rate, 104 | a_std, 105 | self.a_visitors, 106 | self.b_rate, 107 | b_std, 108 | self.b_visitors, 109 | self.test_type, 110 | ) 111 | 112 | confidence_interval = self.compute_confidence_interval( 113 | self.a_rate, 114 | self.b_rate, 115 | self.a_visitors, 116 | self.b_visitors, 117 | self.alpha, 118 | ) 119 | 120 | is_significant = self.is_statistically_significant(p_value, self.alpha) 121 | 122 | return { 123 | "t_statistic": t_statistic, 124 | "p_value": p_value, 125 | "confidence_interval": confidence_interval, 126 | "is_significant": is_significant, 127 | } 128 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/pca_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | import streamlit as st 8 | from sklearn.decomposition import PCA 9 | 10 | 11 | class PCAManager: 12 | def __init__(self, max_n_components: int): 13 | self.max_n_components = max_n_components 14 | self.normalize: bool | None = None 15 | self.model: PCA | None = None 16 | self.target_col: pd.Series | None = None 17 | 18 | @property 19 | def params(self) -> t.Dict[str, int]: 20 | columns = st.columns(2) 21 | return { 22 | "n_components": columns[0].slider( 23 | label="Number of Components", 24 | min_value=1, 25 | max_value=self.max_n_components, 26 | value=3, 27 | step=1, 28 | help="Number of principal components to compute.", 29 | ), 30 | "normalize": ( 31 | columns[1] 32 | .columns([0.5, 1, 0.5])[1] 33 | .toggle("Normalize data", value=False) 34 | ), 35 | } 36 | 37 | @st.cache_resource(show_spinner=True) 38 | def _get_model(_self, n_components: int) -> PCA: 39 | return PCA(n_components) 40 | 41 | def set_model(self) -> None: 42 | params = self.params 43 | self.model = 
self._get_model(params["n_components"]) 44 | self.model.normalize = params["normalize"] 45 | 46 | @st.cache_resource( 47 | show_spinner=True, 48 | hash_funcs={PCA: lambda model: (model.n_components, model.normalize)}, 49 | ) 50 | def _compute_pca( 51 | _self, model: PCA, data: pd.DataFrame 52 | ) -> t.Tuple[pd.DataFrame, PCA]: 53 | data_normalized = ( 54 | (data - data.mean()) / (data.std() + 1e-5) if model.normalize else data 55 | ) 56 | components = model.fit_transform(data_normalized) 57 | 58 | return pd.DataFrame( 59 | components, columns=[f"PC{i+1}" for i in range(components.shape[1])] 60 | ) 61 | 62 | def fit(self, data: pd.DataFrame, target_col: pd.Series): 63 | self.components_df = self._compute_pca(model=self.model, data=data) 64 | self.target_col = target_col 65 | 66 | def scatter_matrix_plot(self) -> None: 67 | return px.scatter_matrix( 68 | self.components_df, color=self.target_col, labels={"color": "target"} 69 | ) 70 | 71 | def explained_variance_plot(self) -> None: 72 | exp_var_cumul = np.cumsum(self.model.explained_variance_ratio_) 73 | x_ticks = list(range(1, exp_var_cumul.shape[0] + 1)) 74 | fig = px.bar( 75 | x=x_ticks, 76 | y=exp_var_cumul, 77 | labels={"x": "# Components", "y": "Explained Variance"}, 78 | ) 79 | fig.update_xaxes(tickvals=x_ticks, ticktext=list(map(str, x_ticks))) 80 | fig.add_trace( 81 | go.Scatter( 82 | x=x_ticks, 83 | y=exp_var_cumul, 84 | mode="lines+markers", 85 | line=dict(color="red", width=3), 86 | marker=dict(size=10), 87 | showlegend=False, 88 | ) 89 | ) 90 | return fig 91 | 92 | def scatter_2d_plot(self) -> None: 93 | return px.scatter( 94 | self.components_df, 95 | x="PC1", 96 | y="PC2", 97 | color=self.target_col, 98 | labels={"color": "target"}, 99 | ) 100 | 101 | def scatter_3d_plot(self) -> None: 102 | return px.scatter_3d( 103 | self.components_df, 104 | x="PC1", 105 | y="PC2", 106 | z="PC3", 107 | color=self.target_col, 108 | labels={"color": "target"}, 109 | ) 110 | 111 | def loadings_plot(self) -> None: 112 | loadings = self.model.components_.T * np.sqrt(self.model.explained_variance_) 113 | 114 | fig = px.scatter( 115 | self.components_df, 116 | x="PC1", 117 | y="PC2", 118 | color=self.target_col, 119 | labels={"color": "target"}, 120 | ) 121 | 122 | for i, feature in enumerate(self.components_df.columns): 123 | fig.add_annotation( 124 | ax=0, 125 | ay=0, 126 | axref="x", 127 | ayref="y", 128 | x=loadings[i, 0], 129 | y=loadings[i, 1], 130 | showarrow=True, 131 | arrowsize=2, 132 | arrowhead=2, 133 | xanchor="right", 134 | yanchor="top", 135 | ) 136 | fig.add_annotation( 137 | x=loadings[i, 0], 138 | y=loadings[i, 1], 139 | ax=0, 140 | ay=0, 141 | xanchor="center", 142 | yanchor="bottom", 143 | text=feature, 144 | yshift=5, 145 | ) 146 | return fig 147 | -------------------------------------------------------------------------------- /pages/pages_config.yaml: -------------------------------------------------------------------------------- 1 | __init__.py: 2 | NAME: Home 3 | ICON: 🏠 4 | DESCRIPTION: | 5 | Welcome to my superapp, a comprehensive toolset for data science and machine learning 🚀 6 | [![source code](https://img.shields.io/badge/source_code-gray?logo=github)](https://github.com/daltunay/my-superapp/) 7 | 8 | Technologies used: 9 | - **Programming language**: Python 10 | - **Libraries**: pandas, numpy, scikit-learn, OpenCV, Mediapipe, plotly, XGBoost, SHAP, LangChain, OpenAI, Together, FAISS, ultralytics, umap 11 | - **Deployment**: Docker, Streamlit 12 | 13 | Feel free to provide feedback and make this superapp even more 

    > _Made by Daniel Altunay_
    [![LinkedIn URL](https://img.icons8.com/?id=13930&format=png)](https://linkedin.com/in/daltunay)
    [![GitHub URL](https://img.icons8.com/?id=AZOZNnY73haj&format=png)](https://github.com/daltunay)

    ---
  SIDEBAR: radio
  TAG:

statistical_tests:
  __init__.py:
    NAME: Statistical Tests
    ICON: 🔢
    DESCRIPTION: Perform several statistical tests!
    SIDEBAR: radio
    TAG: 📚 Statistics

  ab_test.py:
    NAME: A/B Test
    ICON: 🆎
    DESCRIPTION: |
      Perform A/B tests!
      > pandas, numpy, scipy
    TAG:

  chi2_test.py:
    NAME: Chi-squared Test
    ICON: 🆇
    DESCRIPTION: |
      Perform chi-squared tests!
      > pandas, numpy, scipy
    TAG:

dimensionality_reduction:
  __init__.py:
    NAME: Dimensionality Reduction
    ICON: 🔽
    DESCRIPTION: Reduce dimensionality for high-D data!
    SIDEBAR: radio
    TAG: 📚 Statistics

  pca.py:
    NAME: PCA
    ICON: ⭕
    DESCRIPTION: |
      Perform Principal Component Analysis!
      > pandas, scikit-learn, plotly
    TAG:

  t-sne.py:
    NAME: t-SNE
    ICON: 📊
    DESCRIPTION: |
      Perform t-distributed Stochastic Neighbor Embedding!
      > pandas, scikit-learn, plotly
    TAG:

  umap.py:
    NAME: UMAP
    ICON: 🗺️
    DESCRIPTION: |
      Perform Uniform Manifold Approximation and Projection!
      > pandas, scikit-learn, umap, plotly
    TAG:

landmarks:
  __init__.py:
    NAME: Landmarks Detection
    ICON: 📍
    DESCRIPTION: Perform live landmark detection using your webcam!
    SIDEBAR: radio
    TAG: 👁️ Computer Vision

  face_landmarks.py:
    NAME: Face Mesh
    ICON: 👤
    DESCRIPTION: |
      Detect face landmarks using Mediapipe!
      > OpenCV, Mediapipe, WebRTC
    TAG:

  pose_landmarks.py:
    NAME: Pose Landmarks
    ICON: 🤸‍♂️
    DESCRIPTION: |
      Detect body pose landmarks using Mediapipe!
      > OpenCV, Mediapipe, WebRTC
    TAG:

object_detection:
  __init__.py:
    NAME: Object Detection
    ICON: 🔍
    DESCRIPTION: Perform live object detection using your webcam!
    SIDEBAR: radio
    TAG: 👁️ Computer Vision

  face_detection.py:
    NAME: Face Detection
    ICON: 👀
    DESCRIPTION: |
      Detect one or several faces using Mediapipe!
      > OpenCV, Mediapipe, WebRTC
    TAG:

  multi_objects.py:
    NAME: Multi-Object Detection
    ICON: 📦
    DESCRIPTION: |
      Detect 80 unique labels using YOLOv8!
      > OpenCV, ultralytics, WebRTC
    TAG:

image_generation:
  __init__.py:
    NAME: Image Generation
    ICON: 🎨
    DESCRIPTION: Generate pictures with AI!
    SIDEBAR: radio
    TAG: 🧠 Generative AI

  dall_e.py:
    NAME: DALL·E
    ICON: 🖼️
    DESCRIPTION: DALL·E model from OpenAI
    TAG:

  stable_diffusion.py:
    NAME: Stable Diffusion
    ICON: 🖼️
    DESCRIPTION: Stable Diffusion model from Stability AI
    TAG:

large_language_models:
  __init__.py:
    NAME: Large Language Models
    ICON: 💬
    DESCRIPTION: Interact with large language models!
    SIDEBAR: radio
    TAG: 🧠 Generative AI

  chatbot.py:
    NAME: Basic Chatbot
    ICON: 👋
    DESCRIPTION: |
      A regular chatbot.
      > LangChain, OpenAI, Together
    TAG: 🤖 Chatbots

  chatbot_rag.py:
    NAME: Chatbot with RAG
    ICON: 📄
    DESCRIPTION: |
      A chatbot with RAG (retrieval augmented generation).
      > LangChain, OpenAI, Together, FAISS
    TAG: 🤖 Chatbots

  chatbot_tools.py:
    NAME: Chatbot with Tools
    ICON: 🛠️
    DESCRIPTION: |
      A chatbot augmented with tools (web access, code interpreter, etc.).
      > LangChain (Agents), OpenAI, Together
    TAG: 🤖 Chatbots

  chatbot_web_summary.py:
    NAME: Webpage Summary
    ICON: 🌐
    DESCRIPTION: |
      A model to summarize the text content of a webpage.
      > LangChain, OpenAI, Together, unstructured
    TAG: 🔄 Other

classification:
  __init__.py:
    NAME: Classification
    ICON: 🎯
    DESCRIPTION: Perform several types of classification!
    SIDEBAR: radio
    TAG: ⚙️ Machine Learning

  xgboost.py:
    NAME: Gradient Boosting
    ICON: 🌲
    DESCRIPTION: |
      Use gradient boosting for binary & multi-class classification!
      > pandas, XGBoost, scikit-learn, SHAP, plotly
    TAG:

regression:
  __init__.py:
    NAME: Regression
    ICON: 📈
    DESCRIPTION: Perform several types of regression!
    SIDEBAR: radio
    TAG: ⚙️ Machine Learning

  xgboost.py:
    NAME: Gradient Boosting
    ICON: 🌲
    DESCRIPTION: |
      Use gradient boosting for regression!
      > pandas, XGBoost, scikit-learn, SHAP, plotly
    TAG:

clustering:
  __init__.py:
    NAME: Clustering
    ICON: 🕸️
    DESCRIPTION: Perform different types of clustering!
    SIDEBAR: radio
    TAG: ⚙️ Machine Learning

  kmeans.py:
    NAME: K-Means
    ICON: 🇰
    DESCRIPTION: |
      Perform a K-Means clustering!
      > pandas, scikit-learn, plotly
    TAG:

  dbscan.py:
    NAME: DBSCAN
    ICON: 🇩
    DESCRIPTION: |
      Perform a DBSCAN clustering!
      > pandas, scikit-learn, plotly
    TAG:
--------------------------------------------------------------------------------
/src/machine_learning/xgboost_manager.py:
--------------------------------------------------------------------------------
import typing as t

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import sklearn.metrics
import streamlit as st
from matplotlib.figure import Figure
from xgboost import XGBClassifier, XGBRegressor


def xgb_hash_func(model: XGBClassifier | XGBRegressor):
    return {key: val for key, val in vars(model).items() if key != "_Booster"}


class XGBoostManager:
    def __init__(self, task: t.Literal["classification", "regression"]) -> None:
        self.task = task
        self.model: XGBClassifier | XGBRegressor | None = None
        self.classification_report: pd.DataFrame | None = None
        self.confusion_matrix: pd.DataFrame | None = None
        self.metrics_report: pd.DataFrame | None = None

    @property
    def params(self) -> t.Dict[str, float | int]:
        columns = st.columns(3)
        return {
            "max_depth": columns[0].slider(
                label="`max_depth`",
                min_value=1,
                max_value=5,
                value=3,
                step=1,
                help="Maximum depth of a tree. "
                "Increasing this value will make the model more complex and more likely to overfit. "
                "0 indicates no limit on depth.",
            ),
            "learning_rate": columns[0].select_slider(
                label="`learning_rate`",
                options=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
                value=0.01,
                help="Step size shrinkage used in update to prevent overfitting. "
                "After each boosting step, we can directly get the weights of new features, and `learning_rate` shrinks the feature weights to make the boosting process more conservative.",
            ),
            "n_estimators": columns[0].slider(
                label="`n_estimators`",
                min_value=10,
                max_value=50,
                value=50,
                step=10,
                help="Number of gradient boosted trees. "
                "Equivalent to number of boosting rounds.",
            ),
            "subsample": columns[1].slider(
                label="`subsample`",
                min_value=0.1,
                max_value=1.0,
                value=0.8,
                step=0.1,
                help="Subsample ratio of the training instances. "
                "Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees, and this will prevent overfitting. "
                "Subsampling will occur once in every boosting iteration.",
            ),
            "colsample_bytree": columns[1].slider(
                label="`colsample_bytree`",
                min_value=0.1,
                max_value=1.0,
                value=0.8,
                step=0.1,
                help="Subsample ratio of columns when constructing each tree. "
                "Subsampling occurs once for every tree constructed.",
            ),
            "min_split_loss": columns[1].slider(
                label="`min_split_loss`",
                min_value=0.0,
                max_value=5.0,
                value=0.0,
                step=0.5,
                help="Minimum loss reduction required to make a further partition on a leaf node of the tree. "
                "The larger `min_split_loss` is, the more conservative the algorithm will be.",
            ),
            "min_child_weight": columns[2].slider(
                label="`min_child_weight`",
                min_value=0.0,
                max_value=5.0,
                value=1.0,
                step=0.5,
                help="Minimum sum of instance weight (hessian) needed in a child. "
                "If the tree partition step results in a leaf node with the sum of instance weight less than `min_child_weight`, then the building process will give up further partitioning. "
                "In a linear regression task, this simply corresponds to the minimum number of instances needed in each node. "
                "The larger `min_child_weight` is, the more conservative the algorithm will be.",
            ),
            "reg_alpha": columns[2].slider(
                label="`reg_alpha`",
                min_value=0.0,
                max_value=5.0,
                value=1.0,
                step=0.5,
                help="L1 regularization term on weights. "
                "Increasing this value will make the model more conservative.",
            ),
            "reg_lambda": columns[2].slider(
                label="`reg_lambda`",
                min_value=0.0,
                max_value=5.0,
                value=0.0,
                step=0.5,
                help="L2 regularization term on weights. "
                "Increasing this value will make the model more conservative.",
            ),
        }

    @staticmethod
    @st.cache_resource(show_spinner=True)
    def _get_model(
        task: t.Literal["classification", "regression"],
        label_mapping: t.Dict[int, str] | None = None,
        **params,
    ) -> XGBClassifier | XGBRegressor:
        if task == "classification":
            return XGBClassifier(**params)
        elif task == "regression":
            return XGBRegressor(**params)

    def set_model(self, label_mapping: t.Dict[int, str] | None = None) -> None:
        self.model = self._get_model(self.task, label_mapping, **self.params)

    @staticmethod
    @st.cache_resource(
        show_spinner=True,
        hash_funcs={XGBClassifier: xgb_hash_func, XGBRegressor: xgb_hash_func},
    )
    def _fit_model(
        model: XGBClassifier | XGBRegressor, X_train: pd.DataFrame, y_train: pd.Series
    ) -> XGBClassifier | XGBRegressor:
        return model.fit(X_train, y_train)

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        self.model = self._fit_model(self.model, X_train, y_train)

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _classification_report(
        y_true: pd.Series, y_pred: pd.Series, target_names: t.List[str]
    ):
        return (
            pd.DataFrame(
                sklearn.metrics.classification_report(
                    y_true=y_true,
                    y_pred=y_pred,
                    target_names=target_names,
                    output_dict=True,
                    zero_division=np.nan,
                )
            )
            .astype(float)
            .round(4)
            .transpose()
        )

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _confusion_matrix(y_true: pd.Series, y_pred: pd.Series):
        return pd.DataFrame(
            sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=y_pred)
        )

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _metrics_report(y_true: pd.Series, y_pred: pd.Series):
        mean_absolute_error = sklearn.metrics.mean_absolute_error(y_true, y_pred)
        median_absolute_error = sklearn.metrics.median_absolute_error(y_true, y_pred)
        mean_squared_error = sklearn.metrics.mean_squared_error(y_true, y_pred)
        r2 = sklearn.metrics.r2_score(y_true, y_pred)
        explained_variance = sklearn.metrics.explained_variance_score(y_true, y_pred)
        return pd.DataFrame(
            {
                "Mean Absolute Error": [mean_absolute_error],
                "Median Absolute Error": [median_absolute_error],
                "Mean Squared Error": [mean_squared_error],
                "Root Mean Squared Error": [mean_squared_error**0.5],
                "R^2": [r2],
                "Explained Variance": [explained_variance],
            },
            index=["Value"],
        ).transpose()

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _confusion_matrix_display(
        confusion_matrix: np.ndarray, display_labels: t.List[str]
    ) -> sklearn.metrics.ConfusionMatrixDisplay:
        return sklearn.metrics.ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix,
            display_labels=display_labels,
        )

    def confusion_matrix_display(self, display_labels: t.List[str]) -> Figure:
        confusion_matrix_display = self._confusion_matrix_display(
            confusion_matrix=self.confusion_matrix.to_numpy(),
            display_labels=display_labels,
        )
        fig, ax = plt.subplots()
        confusion_matrix_display.plot(ax=ax)
        return fig

    def evaluate(
        self,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        target_names: t.List[str] | None = None,
    ):
        y_pred = self.model.predict(X_test)
        if self.task == "classification":
            self.classification_report = self._classification_report(
                y_true=y_test,
                y_pred=y_pred,
                target_names=target_names,
            )
            self.confusion_matrix = self._confusion_matrix(
                y_true=y_test,
                y_pred=y_pred,
            )
        elif self.task == "regression":
            self.metrics_report = self._metrics_report(
                y_true=y_test,
                y_pred=y_pred,
            )

    @staticmethod
    @st.cache_data(
        show_spinner=True,
        hash_funcs={XGBClassifier: xgb_hash_func, XGBRegressor: xgb_hash_func},
    )
    def _shap_values(model: XGBClassifier | XGBRegressor, X_test: pd.DataFrame):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        return explainer, shap_values

    def shap_force_plot(self, X_test: pd.DataFrame):
        explainer, shap_values = self._shap_values(self.model, X_test)
        base_value = explainer.expected_value
        if isinstance(self.model, XGBClassifier):
            base_value = base_value[0]
            shap_values = shap_values[0]
        return shap.force_plot(
            base_value=base_value, shap_values=shap_values, features=X_test
        )
--------------------------------------------------------------------------------
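A minimal usage sketch (an editor's illustration, not a file in this repository): it assumes XGBoostManager is importable as src.machine_learning.xgboost_manager.XGBoostManager, that the code runs inside a Streamlit page so the widgets rendered by the params property have somewhere to appear, and it substitutes scikit-learn's breast-cancer toy dataset for the app's own loaders in src/machine_learning/datasets.py.

# Hypothetical Streamlit page (not part of the repo): end-to-end use of XGBoostManager.
import streamlit as st
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from src.machine_learning.xgboost_manager import XGBoostManager

st.title("XGBoost classification (usage sketch)")

# Toy data in place of the app's own dataset loaders.
data = load_breast_cancer(as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

manager = XGBoostManager(task="classification")
manager.set_model()  # renders the hyperparameter sliders and builds the XGBClassifier
manager.fit(X_train, y_train)
manager.evaluate(X_test, y_test, target_names=list(data.target_names))

st.dataframe(manager.classification_report)
st.pyplot(manager.confusion_matrix_display(display_labels=list(data.target_names)))

The same flow should apply to task="regression", with metrics_report taking the place of the classification report and confusion matrix.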