├── src ├── __init__.py ├── computer_vision │ ├── __init__.py │ ├── object_detection │ │ ├── __init__.py │ │ ├── multi_objects.py │ │ └── face_detection.py │ └── landmarks │ │ ├── __init__.py │ │ ├── pose_landmarks.py │ │ ├── face_landmarks.py │ │ └── base.py ├── machine_learning │ ├── __init__.py │ ├── clustering │ │ ├── __init__.py │ │ ├── dbscan_manager.py │ │ └── kmeans_manager.py │ ├── datasets.py │ └── xgboost_manager.py ├── statistics │ ├── statistical_tests │ │ ├── __init__.py │ │ ├── chi_squared.py │ │ └── ab_test.py │ └── dimensionality_reduction │ │ ├── __init__.py │ │ ├── umap_manager.py │ │ ├── tsne_manager.py │ │ └── pca_manager.py └── generative_ai │ ├── image_generation │ ├── __init__.py │ ├── dall_e.py │ └── stable_diffusion.py │ └── large_language_models │ ├── __init__.py │ ├── chatbots │ ├── __init__.py │ ├── chatbot_web_summary.py │ ├── chatbot_rag.py │ ├── chatbot_tools.py │ └── chatbot.py │ ├── callbacks.py │ └── ingest.py ├── data └── documents │ └── .gitkeep ├── faiss_index └── .gitkeep ├── notebooks └── draft.ipynb ├── packages.txt ├── pages ├── __init__.py ├── landmarks │ ├── __init__.py │ ├── face_landmarks.py │ └── pose_landmarks.py ├── classification │ ├── __init__.py │ └── xgboost.py ├── clustering │ ├── __init__.py │ ├── dbscan.py │ └── kmeans.py ├── regression │ ├── __init__.py │ └── xgboost.py ├── image_generation │ ├── __init__.py │ ├── dall_e.py │ └── stable_diffusion.py ├── object_detection │ ├── __init__.py │ ├── face_detection.py │ └── multi_objects.py ├── statistical_tests │ ├── __init__.py │ ├── ab_test.py │ └── chi2_test.py ├── dimensionality_reduction │ ├── __init__.py │ ├── t-sne.py │ ├── umap.py │ └── pca.py ├── large_language_models │ ├── __init__.py │ ├── chatbot_web_summary.py │ ├── chatbot.py │ ├── chatbot_tools.py │ └── chatbot_rag.py └── pages_config.yaml ├── utils ├── widgets │ ├── __init__.py │ ├── language.py │ └── lakera.py ├── callbacks.py ├── shap.py ├── secrets.py ├── turn.py ├── __init__.py ├── image_annotation.py ├── logging.py ├── pages_config.py ├── misc.py └── streamlit_display.py ├── .streamlit ├── config.toml └── secrets.toml.example ├── .gitignore ├── app.py ├── Dockerfile ├── bin └── run.sh ├── config ├── providers.yaml └── models.yaml ├── LICENSE ├── pyproject.toml └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/documents/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /faiss_index/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/draft.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | python3-opencv -------------------------------------------------------------------------------- /src/computer_vision/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pages/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 
2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/landmarks/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/classification/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/regression/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/object_detection/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/statistical_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/__init__.py: -------------------------------------------------------------------------------- 1 | import utils 2 | 3 | loader = utils.PageConfigLoader(__file__) 4 | loader.set_page_config(globals()) 5 | -------------------------------------------------------------------------------- /utils/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | from utils.widgets.lakera import LakeraWidget 2 | from utils.widgets.language import LanguageWidget 3 | 4 | __all__ = ["LakeraWidget", "LanguageWidget"] 5 | -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [browser] 2 | gatherUsageStats = true 3 | 4 | [server] 5 | address = "0.0.0.0" 6 | port = 8501 7 | 8 | [global] 9 | disableWidgetStateDuplicationWarning = true 10 | -------------------------------------------------------------------------------- /utils/callbacks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | def update_slider_callback(updated: str, to_update: str): 5 | 
setattr(st.session_state, to_update, 1 - st.session_state.get(updated)) 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_store 3 | .mypy_cache/ 4 | .ruff_cache/ 5 | .__pycache__/ 6 | .streamlit/secrets.toml 7 | *.pem 8 | *.pyc 9 | cache/ 10 | TODO.md 11 | *.pt 12 | *.faiss 13 | *.pkl -------------------------------------------------------------------------------- /src/machine_learning/__init__.py: -------------------------------------------------------------------------------- 1 | from src.machine_learning.datasets import Dataset 2 | from src.machine_learning.xgboost_manager import XGBoostManager 3 | 4 | __all__ = ["Dataset", "XGBoostManager"] 5 | -------------------------------------------------------------------------------- /src/machine_learning/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from src.machine_learning.clustering.dbscan_manager import DBScanManager 2 | from src.machine_learning.clustering.kmeans_manager import KMeansManager 3 | 4 | __all__ = ["KMeansManager", "DBScanManager"] 5 | -------------------------------------------------------------------------------- /utils/shap.py: -------------------------------------------------------------------------------- 1 | import shap 2 | import streamlit.components.v1 as components 3 | 4 | 5 | def st_shap(plot, height=None): 6 | shap_html = f"{shap.getjs()}{plot.html()}" 7 | components.html(shap_html, height=height) 8 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import streamlit_superapp as st_superapp 3 | 4 | import utils 5 | 6 | utils.load_secrets() 7 | 8 | st.set_page_config(page_title="daltunay", page_icon="🚀", layout="centered") 9 | 10 | st_superapp.run() 11 | -------------------------------------------------------------------------------- /src/computer_vision/object_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from src.computer_vision.object_detection.face_detection import \ 2 | FaceDetectionApp 3 | from src.computer_vision.object_detection.multi_objects import \ 4 | MultiObjectsDetectionApp 5 | 6 | __all__ = ["FaceDetectionApp", "MultiObjectsDetectionApp"] 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY . 
/app 6 | 7 | RUN pip install "poetry==1.7.0" \ 8 | && poetry config virtualenvs.create false \ 9 | && poetry install --no-interaction --no-dev 10 | 11 | EXPOSE 8501 12 | 13 | ENTRYPOINT ["streamlit", "run"] 14 | 15 | CMD ["app.py"] 16 | -------------------------------------------------------------------------------- /src/statistics/statistical_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from src.statistics.statistical_tests.ab_test import (ABTesting, 2 | input_group_data) 3 | from src.statistics.statistical_tests.chi_squared import Chi2Testing 4 | 5 | __all__ = ["ABTesting", "input_group_data", "Chi2Testing"] 6 | -------------------------------------------------------------------------------- /bin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DOCKER_CLI_HINTS=false 4 | 5 | if [ "$(docker ps -q --filter ancestor=daltunay)" ]; then 6 | docker stop $(docker ps -q --filter ancestor=daltunay) 7 | docker rm $(docker ps -q --filter ancestor=daltunay) 8 | fi 9 | 10 | docker build -t daltunay . && 11 | docker run -p 8501:8501 daltunay 12 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from src.computer_vision.landmarks.base import BaseLandmarkerApp 2 | from src.computer_vision.landmarks.face_landmarks import FaceLandmarkerApp 3 | from src.computer_vision.landmarks.pose_landmarks import PoseLandmarkerApp 4 | 5 | __all__ = ["BaseLandmarkerApp", "FaceLandmarkerApp", "PoseLandmarkerApp"] 6 | -------------------------------------------------------------------------------- /.streamlit/secrets.toml.example: -------------------------------------------------------------------------------- 1 | [twilio] 2 | TWILIO_ACCOUNT_SID = "<...>" 3 | TWILIO_AUTH_TOKEN = "<...>" 4 | 5 | [openai] 6 | OPENAI_API_KEY = "<...>" 7 | 8 | [together] 9 | TOGETHER_API_KEY = "<...>" 10 | 11 | [lakera_guard] 12 | LAKERA_GUARD_API_KEY = "<...>" 13 | 14 | [google] 15 | GOOGLE_API_KEY = "<...>" 16 | GOOGLE_CSE_ID = "<...>" 17 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/__init__.py: -------------------------------------------------------------------------------- 1 | from src.statistics.dimensionality_reduction.pca_manager import PCAManager 2 | from src.statistics.dimensionality_reduction.tsne_manager import TSNEManager 3 | from src.statistics.dimensionality_reduction.umap_manager import UMAPManager 4 | 5 | __all__ = ["PCAManager", "TSNEManager", "UMAPManager"] 6 | -------------------------------------------------------------------------------- /pages/large_language_models/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import yaml 4 | 5 | import utils 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | with open("config/models.yaml") as f: 11 | LLM_CONFIG: t.Dict[str, str] = yaml.safe_load(f)["generative_ai"][ 12 | "large_language_models" 13 | ] 14 | 15 | __all__ = ["LLM_CONFIG"] 16 | -------------------------------------------------------------------------------- /utils/secrets.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | 5 | import utils 6 | 7 | logger = 
utils.CustomLogger(__file__) 8 | 9 | 10 | def load_secrets(): 11 | for secrets in st.secrets.values(): 12 | for secret_name, secret in secrets.items(): 13 | masked_secret = secret[:4] + "*" * (len(secret) - 4) 14 | logger.info(f"Setting {secret_name}={masked_secret}") 15 | os.environ[secret_name] = secret 16 | -------------------------------------------------------------------------------- /config/providers.yaml: -------------------------------------------------------------------------------- 1 | openai: 2 | name: OpenAI 3 | url: https://openai.com/ 4 | api: 5 | help: https://platform.openai.com/account/api-keys 6 | endpoint: https://api.openai.com/v1/images/generations 7 | key: OPENAI_API_KEY 8 | 9 | together: 10 | name: Together AI 11 | url: https://www.together.ai/ 12 | api: 13 | help: https://api.together.xyz/settings/api-keys 14 | endpoint: https://api.together.xyz/inference 15 | key: TOGETHER_API_KEY 16 | -------------------------------------------------------------------------------- /utils/turn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | from twilio.rest import Client 5 | 6 | import utils 7 | 8 | logger = utils.CustomLogger(__file__) 9 | 10 | 11 | @st.cache_data(show_spinner=False) 12 | def get_ice_servers(): 13 | account_sid = os.getenv("TWILIO_ACCOUNT_SID") 14 | auth_token = os.getenv("TWILIO_AUTH_TOKEN") 15 | 16 | client = Client(account_sid, auth_token) 17 | token = client.tokens.create() 18 | 19 | return token.ice_servers 20 | -------------------------------------------------------------------------------- /pages/landmarks/face_landmarks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.landmarks import FaceLandmarkerApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code("src/computer_vision/landmarks/face_landmarks.py") 16 | 17 | st_ss.setdefault("face_app", FaceLandmarkerApp()).stream() 18 | -------------------------------------------------------------------------------- /pages/landmarks/pose_landmarks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.landmarks import PoseLandmarkerApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code("src/computer_vision/landmarks/pose_landmarks.py") 16 | 17 | st_ss.setdefault("pose_app", PoseLandmarkerApp()).stream() 18 | -------------------------------------------------------------------------------- /src/generative_ai/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import yaml 4 | 5 | from src.generative_ai.image_generation.dall_e import dall_e_image 6 | from src.generative_ai.image_generation.stable_diffusion import \ 7 | stable_diffusion_image 8 | 9 | with open("config/models.yaml") as f: 10 | IMAGE_GEN_CONFIG: t.Dict[str, str] = yaml.safe_load(f)["generative_ai"][ 11 | "image_creation" 12 | ] 13 | 14 | __all__ = ["IMAGE_GEN_CONFIG", "dall_e_image", "stable_diffusion_image"] 15 | 
-------------------------------------------------------------------------------- /pages/object_detection/face_detection.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.object_detection import FaceDetectionApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code( 16 | path="src/computer_vision/object_detection/face_detection.py" 17 | ) 18 | 19 | st_ss.setdefault("face_detection_app", FaceDetectionApp()).stream() 20 | -------------------------------------------------------------------------------- /pages/object_detection/multi_objects.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.computer_vision.object_detection import MultiObjectsDetectionApp 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code(path="src/computer_vision/object_detection/multi_objects.py") 16 | 17 | st_ss.setdefault("multi_objects_detection_app", MultiObjectsDetectionApp()).stream() 18 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/__init__.py: -------------------------------------------------------------------------------- 1 | from src.generative_ai.large_language_models.callbacks import \ 2 | StreamingChatCallbackHandler 3 | from src.generative_ai.large_language_models.chatbots import ( 4 | Chatbot, ChatbotRAG, ChatbotTools, ChatbotWebSummary) 5 | from src.generative_ai.large_language_models.ingest import get_vector_store 6 | 7 | __all__ = [ 8 | "Chatbot", 9 | "ChatbotRAG", 10 | "ChatbotTools", 11 | "ChatbotWebSummary", 12 | "StreamingChatCallbackHandler", 13 | "get_vector_store", 14 | ] 15 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/__init__.py: -------------------------------------------------------------------------------- 1 | from src.generative_ai.large_language_models.chatbots.chatbot import ( 2 | Chatbot, ModelArgs) 3 | from src.generative_ai.large_language_models.chatbots.chatbot_rag import \ 4 | ChatbotRAG 5 | from src.generative_ai.large_language_models.chatbots.chatbot_tools import \ 6 | ChatbotTools 7 | from src.generative_ai.large_language_models.chatbots.chatbot_web_summary import \ 8 | ChatbotWebSummary 9 | 10 | __all__ = ["Chatbot", "ModelArgs", "ChatbotRAG", "ChatbotTools", "ChatbotWebSummary"] 11 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/callbacks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from langchain.callbacks.base import BaseCallbackHandler 3 | 4 | 5 | class StreamingChatCallbackHandler(BaseCallbackHandler): 6 | def __init__(self): 7 | pass 8 | 9 | def on_llm_start(self, *args, **kwargs): 10 | self.container = st.empty() 11 | self.text = "" 12 | 13 | def on_llm_new_token(self, token: str, *args, **kwargs): 14 | self.text += token 15 | self.container.markdown( 16 | body=self.text, 17 | unsafe_allow_html=False, 18 | ) 19 | 20 | def 
on_llm_end(self, response: str, *args, **kwargs): 21 | self.container.markdown( 22 | body=response.generations[0][0].text, 23 | unsafe_allow_html=False, 24 | ) 25 | -------------------------------------------------------------------------------- /src/generative_ai/image_generation/dall_e.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from openai import OpenAI 3 | from PIL import Image 4 | 5 | import utils 6 | from utils.misc import base64_to_img 7 | 8 | logger = utils.CustomLogger(__file__) 9 | 10 | 11 | @st.cache_data(show_spinner="Generating picture...") 12 | def dall_e_image( 13 | prompt: str, 14 | width: int = 1024, 15 | height: int = 1024, 16 | ) -> Image.Image: 17 | from src.generative_ai.image_generation import IMAGE_GEN_CONFIG 18 | 19 | model_config = IMAGE_GEN_CONFIG["DALL-E 2"] 20 | 21 | client = OpenAI() 22 | response = client.images.generate( 23 | model=model_config["string"], 24 | prompt=prompt, 25 | size=f"{width}x{height}", 26 | n=1, 27 | response_format="b64_json", 28 | ) 29 | base64 = response.data[0].b64_json 30 | return base64_to_img(base64) 31 | -------------------------------------------------------------------------------- /pages/image_generation/dall_e.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.generative_ai.image_generation import dall_e_image 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | 12 | def main(): 13 | utils.show_source_code("src/generative_ai/image_generation/dall_e.py") 14 | 15 | submitted = False 16 | with st.form(key="dall_e_form"): 17 | prompt = st.text_input(label="Input prompt: ") 18 | centered = st.columns(3)[1] 19 | with centered: 20 | submitted = st.form_submit_button( 21 | label="Generate with DALL·E", use_container_width=True 22 | ) 23 | st.subheader(body="Output", anchor=False) 24 | if submitted: 25 | image = dall_e_image(prompt=prompt) 26 | st.image(image=image, caption=f"{prompt} - Generated by DALL·E") 27 | -------------------------------------------------------------------------------- /src/generative_ai/image_generation/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | import together 5 | from PIL import Image 6 | 7 | import utils 8 | from utils.misc import base64_to_img 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | 13 | @st.cache_data(show_spinner="Generating picture...") 14 | def stable_diffusion_image( 15 | prompt: str, 16 | width: int = 1024, 17 | height: int = 1024, 18 | ) -> Image.Image: 19 | from src.generative_ai.image_generation import IMAGE_GEN_CONFIG 20 | 21 | model_config = IMAGE_GEN_CONFIG["Stable Diffusion 2.1"] 22 | together.api_key = os.getenv("TOGETHER_API_KEY") 23 | 24 | response = together.Image.create( 25 | model=f"{model_config['owner']}/{model_config['string']}", 26 | prompt=prompt, 27 | width=width, 28 | height=height, 29 | ) 30 | 31 | base64 = response["output"]["choices"][0]["image_base64"] 32 | return base64_to_img(base64) 33 | -------------------------------------------------------------------------------- /pages/image_generation/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.generative_ai.image_generation import 
stable_diffusion_image 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | 12 | def main(): 13 | utils.show_source_code( 14 | path="src/generative_ai/image_generation/stable_diffusion.py" 15 | ) 16 | 17 | submitted = False 18 | with st.form(key="stable_diffusion_form"): 19 | prompt = st.text_input(label="Input prompt: ") 20 | centered = st.columns(3)[1] 21 | with centered: 22 | submitted = st.form_submit_button( 23 | label="Generate with Stable Diffusion", use_container_width=True 24 | ) 25 | st.subheader(body="Output", anchor=False) 26 | if submitted: 27 | image = stable_diffusion_image(prompt=prompt) 28 | st.image(image=image, caption=f"{prompt} - Generated by Stable Diffusion") 29 | -------------------------------------------------------------------------------- /config/models.yaml: -------------------------------------------------------------------------------- 1 | generative_ai: 2 | large_language_models: 3 | GPT-3.5 Turbo: 4 | provider: openai 5 | organization: OpenAI 6 | owner: null 7 | string: gpt-3.5-turbo 8 | experimental_flag: true 9 | 10 | LLaMA-2 Chat (7B): 11 | provider: together 12 | organization: Meta 13 | owner: togethercomputer 14 | string: llama-2-7b-chat 15 | experimental_flag: true 16 | 17 | Mistral (7B) Instruct: 18 | provider: together 19 | organization: mistralai 20 | owner: mistralai 21 | string: Mistral-7B-Instruct-v0.1 22 | experimental_flag: true 23 | 24 | image_creation: 25 | DALL-E 2: 26 | provider: openai 27 | organization: OpenAI 28 | owner: null 29 | string: dall-e-2 30 | experimental_flag: true 31 | 32 | Stable Diffusion 2.1: 33 | provider: together 34 | organization: Stability AI 35 | owner: stabilityai 36 | string: stable-diffusion-2-1 37 | experimental_flag: true 38 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from utils.callbacks import update_slider_callback 2 | from utils.image_annotation import annotate_time 3 | from utils.logging import CustomLogger 4 | from utils.misc import (base64_to_img, generate_logo_link, 5 | reset_session_state_key, show_logos, show_source_code) 6 | from utils.pages_config import PageConfigLoader 7 | from utils.secrets import load_secrets 8 | from utils.shap import st_shap 9 | from utils.streamlit_display import display_tab_content, tabs_config 10 | from utils.turn import get_ice_servers 11 | from utils.widgets import LakeraWidget, LanguageWidget 12 | 13 | __all__ = [ 14 | "base64_to_img", 15 | "generate_logo_link", 16 | "load_secrets", 17 | "CustomLogger", 18 | "show_logos", 19 | "show_source_code", 20 | "LakeraWidget", 21 | "LanguageWidget", 22 | "PageConfigLoader", 23 | "reset_session_state_key", 24 | "get_ice_servers", 25 | "annotate_time", 26 | "tabs_config", 27 | "display_tab_content", 28 | "update_slider_callback", 29 | "st_shap", 30 | ] 31 | -------------------------------------------------------------------------------- /utils/image_annotation.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import cv2 4 | from numpy import ndarray 5 | 6 | 7 | def annotate_time(image: ndarray) -> None: 8 | text = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 9 | text_args = { 10 | "text": text, 11 | "fontFace": cv2.FONT_HERSHEY_SIMPLEX, 12 | "fontScale": .5, 13 | "thickness": 1, 14 | } 15 | text_size = 
cv2.getTextSize(**text_args)[0] 16 | rect_width, rect_height = text_size[0] + 20, text_size[1] + 20 17 | cv2.rectangle( 18 | img=image, 19 | pt1=(0, 0), 20 | pt2=(rect_width, rect_height), 21 | color=(255, 255, 255), 22 | thickness=cv2.FILLED, 23 | ) 24 | cv2.rectangle( 25 | img=image, 26 | pt1=(0, 0), 27 | pt2=(rect_width, rect_height), 28 | color=(0, 0, 0), 29 | thickness=2, 30 | ) 31 | cv2.putText( 32 | img=image, 33 | org=(10, text_size[1] + 10), 34 | color=(0, 0, 0), 35 | lineType=cv2.LINE_AA, 36 | **text_args, 37 | ) 38 | -------------------------------------------------------------------------------- /utils/widgets/language.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import streamlit as st 4 | 5 | import utils 6 | 7 | logger = utils.CustomLogger(__file__) 8 | 9 | st_ss = st.session_state 10 | 11 | 12 | class LanguageWidget: 13 | widget_key = "language_widget" 14 | selectbox_key = f"{widget_key}.selection" 15 | 16 | def __init__( 17 | self, 18 | languages: t.List[str] | None = None, 19 | default: str | None = None, 20 | ): 21 | logger.info(f"Initializing {self.__class__.__name__}") 22 | self.languages = languages or ["English", "French"] 23 | self.default = default or "English" 24 | 25 | @property 26 | def selected_language(self): 27 | return st.selectbox( 28 | label="Language:", 29 | options=list(self.languages), 30 | index=list(self.languages).index( 31 | st_ss.get(self.selectbox_key, self.default) 32 | ), 33 | key=self.selectbox_key, 34 | help="Changes the **chat language only**, not the interface language", 35 | ) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Daniel Altunay 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/pose_landmarks.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | import mediapipe as mp 5 | 6 | from src.computer_vision.landmarks import BaseLandmarkerApp 7 | 8 | 9 | class PoseLandmarkerApp(BaseLandmarkerApp): 10 | landmarks_type = "pose_landmarks" 11 | 12 | def __init__(self): 13 | super().__init__() 14 | 15 | @cached_property 16 | def landmarker(self) -> mp.solutions.pose.Pose: 17 | return mp.solutions.pose.Pose( 18 | static_image_mode=False, 19 | model_complexity=1, 20 | smooth_landmarks=True, 21 | enable_segmentation=False, 22 | min_detection_confidence=0.5, 23 | min_tracking_confidence=0.5, 24 | ) 25 | 26 | @cached_property 27 | def connections_list(self) -> t.List[t.FrozenSet[t.Tuple[int, int]]]: 28 | return [mp.solutions.pose.POSE_CONNECTIONS] 29 | 30 | @cached_property 31 | def drawing_specs_list( 32 | self, 33 | ) -> t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]]: 34 | return [ 35 | { 36 | "landmark_drawing_spec": mp.solutions.drawing_styles.get_default_pose_landmarks_style() 37 | } 38 | ] 39 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | from functools import cached_property 4 | 5 | import streamlit as st 6 | 7 | 8 | class CustomLogger: 9 | method_names = ["debug", "info", "warning", "error", "critical"] 10 | 11 | def __init__(self, file: str, level: str = "info"): 12 | self.file = file.split("my-superapp")[1] if "my-superapp" in file else file 13 | self.level = getattr(logging, level.upper()) 14 | self.cache_methods(methods_to_cache=self.method_names) 15 | 16 | @cached_property 17 | def logger(self) -> logging.Logger: 18 | logger = logging.getLogger(self.file) 19 | logger.setLevel(self.level) 20 | stream_handler = logging.StreamHandler() 21 | formatter = logging.Formatter( 22 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 23 | ) 24 | stream_handler.setFormatter(formatter) 25 | logger.addHandler(stream_handler) 26 | 27 | return logger 28 | 29 | def cache_methods(self, methods_to_cache: t.List[str]) -> None: 30 | for method_name in methods_to_cache: 31 | method = getattr(self.logger, method_name) 32 | wrapped_method = st.cache_resource(func=method, show_spinner=False) 33 | setattr(self, method_name, wrapped_method) 34 | -------------------------------------------------------------------------------- /src/statistics/statistical_tests/chi_squared.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | from scipy.stats import chi2_contingency 6 | 7 | 8 | class Chi2Testing: 9 | def __init__( 10 | self, 11 | observed: pd.DataFrame, 12 | alpha: float, 13 | ): 14 | self.observed = observed 15 | self.alpha = alpha 16 | 17 | @staticmethod 18 | @st.cache_data(show_spinner=False) 19 | def chi2_test( 20 | observed: pd.DataFrame, 21 | ) -> t.Tuple[float, float, int, t.List[t.List[float]]]: 22 | chi2, p_value, dof, expected = chi2_contingency(observed) 23 | return chi2, p_value, dof, expected 24 | 25 | @staticmethod 26 | @st.cache_data(show_spinner=False) 27 | def is_statistically_significant(p_value: float, alpha: float) -> bool: 28 | return p_value < alpha 29 
| 30 | def perform_chi2_test(self) -> t.Dict[str, t.Any]: 31 | chi2, p_value, dof, expected = self.chi2_test(self.observed) 32 | is_significant = self.is_statistically_significant(p_value, self.alpha) 33 | 34 | return { 35 | "chi2_statistic": chi2, 36 | "p_value": p_value, 37 | "degrees_of_freedom": dof, 38 | "expected_frequencies": expected, 39 | "is_significant": is_significant, 40 | } 41 | -------------------------------------------------------------------------------- /utils/pages_config.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | import yaml 5 | 6 | import utils 7 | 8 | 9 | class PageConfigLoader: 10 | config_path = "pages/pages_config.yaml" 11 | 12 | def __init__(self, file): 13 | self.file = file 14 | self.logger = utils.CustomLogger(self.file) 15 | 16 | @cached_property 17 | def pages_config(self) -> t.Dict: 18 | with open(self.config_path, "r") as file: 19 | pages_config = yaml.safe_load(file) 20 | return pages_config 21 | 22 | @cached_property 23 | def page_config(self) -> t.Dict: 24 | path_keys = self.file.split("my-superapp/pages/")[1].split("/") 25 | section = self.pages_config 26 | 27 | for path_key in path_keys: 28 | section = section.get(path_key, {}) 29 | 30 | return self._set_recursive(section, path_keys) 31 | 32 | def _set_recursive(self, section, keys) -> t.Dict: 33 | return { 34 | key: self._set_recursive(value, keys + [key]) 35 | if isinstance(value, dict) 36 | else value 37 | for key, value in section.items() 38 | } 39 | 40 | def set_page_config(self, _globals): 41 | self.logger.info(f"Setting page config: {self.page_config}") 42 | for key, value in self.page_config.items(): 43 | _globals[key] = value 44 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "my-superapp" 3 | version = "0.0.0" 4 | description = "Daniel ALTUNAY's superapp!" 
5 | authors = ["Daniel Altunay "] 6 | readme = "README.md" 7 | homepage = "https://data-science-superapp.streamlit.app/" 8 | repository = "https://github.com/daltunay/my-superapp" 9 | packages = [{ include = "src" }, { include = "utils" }, { include = "pages" }] 10 | 11 | [tool.poetry.dependencies] 12 | python = "~3.11" 13 | # Streamlit 14 | streamlit = "^1.29.0" 15 | streamlit-superapp = "^1.3.0" 16 | streamlit-webrtc = "^0.47.1" 17 | twilio = "^8.10.3" 18 | watchdog = "^3.0.0" 19 | # Computer Vision 20 | opencv-python-headless = "^4.8.1.78" 21 | av = ">=9.0.0,<11.0.0" 22 | ultralytics = "^8.0.222" 23 | mediapipe = "^0.10.8" 24 | # LLMs 25 | langchain = "^0.0.345" 26 | openai = "^1.3.5" 27 | together = "^0.2.8" 28 | tiktoken = "^0.5.1" 29 | faiss-cpu = "^1.7.4" 30 | transformers = "^4.35.2" 31 | pypdf = "^3.17.1" 32 | unstructured = "^0.11.2" 33 | validators = "^0.22.0" 34 | ## Tools 35 | google-api-python-client = "^2.108.0" 36 | arxiv = "^2.0.0" 37 | wikipedia = "^1.4.0" 38 | stackapi = "^0.3.0" 39 | # Machine Learning 40 | scipy = "^1.11.4" 41 | scikit-learn = "^1.3.2" 42 | xgboost = "^2.0.2" 43 | shap = "^0.44.0" 44 | umap-learn = "^0.5.5" 45 | # Data Visualization 46 | plotly = "^5.18.0" 47 | 48 | [tool.poetry.group.dev.dependencies] 49 | ruff = "^0.1.3" 50 | isort = "^5.12.0" 51 | mypy = "^1.6.1" 52 | ipykernel = "^6.26.0" 53 | 54 | [tool.poetry.group.types.dependencies] 55 | types-requests = "^2.31.0.10" 56 | 57 | [build-system] 58 | requires = ["poetry-core"] 59 | build-backend = "poetry.core.masonry.api" 60 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/face_landmarks.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | import mediapipe as mp 5 | 6 | from src.computer_vision.landmarks import BaseLandmarkerApp 7 | 8 | 9 | class FaceLandmarkerApp(BaseLandmarkerApp): 10 | landmarks_type = "multi_face_landmarks" 11 | 12 | def __init__(self): 13 | super().__init__() 14 | 15 | @cached_property 16 | def landmarker(self) -> mp.solutions.face_mesh.FaceMesh: 17 | return mp.solutions.face_mesh.FaceMesh( 18 | static_image_mode=False, 19 | max_num_faces=1, 20 | refine_landmarks=True, 21 | min_detection_confidence=0.5, 22 | min_tracking_confidence=0.5, 23 | ) 24 | 25 | @cached_property 26 | def connections_list(self) -> t.List[t.FrozenSet[t.Tuple[int, int]]]: 27 | return [ 28 | mp.solutions.face_mesh.FACEMESH_TESSELATION, 29 | mp.solutions.face_mesh.FACEMESH_CONTOURS, 30 | mp.solutions.face_mesh.FACEMESH_IRISES, 31 | ] 32 | 33 | @cached_property 34 | def drawing_specs_list( 35 | self, 36 | ) -> t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]]: 37 | return [ 38 | {"connection_drawing_spec": style, "landmark_drawing_spec": None} 39 | for style in ( 40 | mp.solutions.drawing_styles.get_default_face_mesh_tesselation_style(), 41 | mp.solutions.drawing_styles.get_default_face_mesh_contours_style(), 42 | mp.solutions.drawing_styles.get_default_face_mesh_iris_connections_style(), 43 | ) 44 | ] 45 | -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | from base64 import b64decode 2 | from io import BytesIO 3 | 4 | import streamlit as st 5 | from PIL import Image 6 | 7 | st_ss = st.session_state 8 | 9 | 10 | def generate_logo_link(url: str, img_url: str) -> str: 11 | return f'' 12 | 13 | 14 | def 
show_source_code(path: str): 15 | st.markdown( 16 | "[![source code](https://img.shields.io/badge/source_code-gray?logo=github)]" 17 | f"(https://github.com/daltunay/my-superapp/tree/main/{path})" 18 | ) 19 | 20 | 21 | def show_logos(linkedin: bool = True, github: bool = True): 22 | logos = [] 23 | 24 | if linkedin: 25 | logos.append( 26 | generate_logo_link( 27 | url="https://linkedin.com/in/daltunay", 28 | img_url="https://img.icons8.com/?id=13930&format=png", 29 | ) 30 | ) 31 | 32 | if github: 33 | logos.append( 34 | generate_logo_link( 35 | url="https://github.com/daltunay", 36 | img_url="https://img.icons8.com/?id=AZOZNnY73haj&format=png", 37 | ) 38 | ) 39 | 40 | logos_html = "".join(logos) 41 | html_content = f""" 42 |
43 | Made by Daniel Altunay
44 | {logos_html} 45 |
46 | """ 47 | 48 | st.markdown(html_content, unsafe_allow_html=True) 49 | 50 | 51 | def base64_to_img(base64: str) -> Image.Image: 52 | return Image.open(BytesIO(b64decode(base64))) 53 | 54 | 55 | def reset_session_state_key(key: str): 56 | if hasattr(st_ss, key): 57 | delattr(st_ss, key) 58 | -------------------------------------------------------------------------------- /pages/clustering/dbscan.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.clustering import DBScanManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/machine_learning/clustering/dbscan_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("DBSCAN") 36 | with st.container(border=True): 37 | dbscan_manager = DBScanManager() 38 | dbscan_manager.set_model() 39 | 40 | dbscan_manager.fit(data=X) 41 | 42 | st.subheader("Scatter plot", divider="gray") 43 | col_x, col_y = st.columns(2) 44 | x_col_scatter = col_x.selectbox( 45 | label="X column", key="scatter_x", options=X.columns, index=0 46 | ) 47 | y_col_scatter = col_y.selectbox( 48 | label="Y column", key="scatter_y", options=X.columns, index=1 49 | ) 50 | st.plotly_chart( 51 | dbscan_manager.scatter_plot(x_col_scatter, y_col_scatter), 52 | use_container_width=True, 53 | ) 54 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/ingest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | from langchain.document_loaders import DirectoryLoader, PyPDFLoader 5 | from langchain.embeddings import OpenAIEmbeddings 6 | from langchain.text_splitter import RecursiveCharacterTextSplitter 7 | from langchain.vectorstores import FAISS 8 | 9 | import utils 10 | 11 | 12 | def get_loader( 13 | file: str | None = None, 14 | mode: t.Literal["local"] | t.Literal["upload"] = "local", 15 | ) -> DirectoryLoader | PyPDFLoader: 16 | if mode == "local": 17 | return DirectoryLoader( 18 | path="data/documents/", 19 | glob="./*.pdf", 20 | loader_cls=PyPDFLoader, 21 | show_progress=True, 22 | ) 23 | elif mode == "upload": 24 | return PyPDFLoader(file) 25 | 26 | 27 | def get_vector_store( 28 | file: str | None = None, 29 | mode: t.Literal["local"] | t.Literal["upload"] = "local", 30 | ) -> FAISS | None: 31 | loader = get_loader(file=file, mode=mode) 32 | documents = loader.load() 33 | splitter = RecursiveCharacterTextSplitter( 34 | chunk_size=1000, 35 | chunk_overlap=50, 36 | length_function=len, 37 | ) 38 | documents_chunked = splitter.split_documents(documents) 39 | embeddings = OpenAIEmbeddings() 40 | db = FAISS.from_documents(documents=documents_chunked, embedding=embeddings) 41 | 42 | if 
mode == "local": 43 | db.save_local( 44 | folder_path="faiss_index", 45 | index_name="index" if mode == "local" else os.path.splitext(file)[0], 46 | ) 47 | elif mode == "upload": 48 | return db 49 | 50 | 51 | def main(): 52 | get_vector_store(file=None, mode="local") 53 | 54 | 55 | if __name__ == "__main__": 56 | utils.load_secrets() 57 | main() 58 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot_web_summary.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import validators 3 | 4 | import utils 5 | from pages.large_language_models import LLM_CONFIG 6 | from src.generative_ai.large_language_models import ChatbotWebSummary 7 | 8 | loader = utils.PageConfigLoader(__file__) 9 | loader.set_page_config(globals()) 10 | 11 | logger = utils.CustomLogger(__file__) 12 | 13 | st_ss = st.session_state 14 | 15 | 16 | def main(): 17 | utils.show_source_code( 18 | path="src/generative_ai/large_language_models/chatbots/chatbot_web_summary.py" 19 | ) 20 | chosen_model = st.selectbox( 21 | label="Large Language Model:", 22 | placeholder="Choose an option", 23 | options=LLM_CONFIG.keys(), 24 | index=0, 25 | on_change=utils.reset_session_state_key, 26 | kwargs={"key": "chatbot_web_summary"}, 27 | ) 28 | 29 | chosen_chain_type = st.selectbox( 30 | label="Chain type:", 31 | options=ChatbotWebSummary.available_chain_types, 32 | index=None, 33 | on_change=utils.reset_session_state_key, 34 | kwargs={"key": "chatbot_web_summary"}, 35 | ) 36 | 37 | if chosen_model and chosen_chain_type: 38 | chatbot = st_ss.setdefault( 39 | "chatbot_web_summary", ChatbotWebSummary(**LLM_CONFIG[chosen_model]) 40 | ) 41 | else: 42 | st.info("Choose a chain type for the LLM", icon="ℹ️") 43 | 44 | if input_url := st.text_input( 45 | label="URL of the page to summarize:", 46 | disabled=not (chosen_model and chosen_chain_type), 47 | ): 48 | if validators.url(input_url): 49 | st.chat_message("human").write(input_url) 50 | with st.chat_message("ai"): 51 | chatbot.summarize(url=input_url) 52 | else: 53 | st.error("Invalid URL", icon="❌") 54 | -------------------------------------------------------------------------------- /src/computer_vision/object_detection/multi_objects.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | 3 | import streamlit_webrtc as st_webrtc 4 | from av import VideoFrame 5 | from numpy import ndarray 6 | from ultralytics import YOLO 7 | from ultralytics.engine.results import Results 8 | 9 | import utils 10 | 11 | logger = utils.CustomLogger(__file__) 12 | 13 | 14 | class MultiObjectsDetectionApp: 15 | def __init__(self): 16 | pass 17 | 18 | @cached_property 19 | def detector(self) -> YOLO: 20 | return YOLO(model="yolov8n.pt", task=None) 21 | 22 | def detect_objects(self, image: ndarray) -> Results: 23 | return self.detector.predict( 24 | source=image, 25 | stream=False, 26 | show=False, 27 | show_labels=True, 28 | show_conf=True, 29 | verbose=False, 30 | ) 31 | 32 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame: 33 | image = frame.to_ndarray(format="bgr24") 34 | 35 | detections = self.detect_objects(image) 36 | image = self.annotate_detections(detections) 37 | utils.annotate_time(image) 38 | return VideoFrame.from_ndarray(image, format="bgr24") 39 | 40 | def stream(self) -> None: 41 | st_webrtc.webrtc_streamer( 42 | video_frame_callback=self.video_frame_callback, 43 | key="multi_objects_streamer", 
44 | mode=st_webrtc.WebRtcMode.SENDRECV, 45 | rtc_configuration=st_webrtc.RTCConfiguration( 46 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"} 47 | ), 48 | media_stream_constraints={"video": True, "audio": False}, 49 | async_processing=True, 50 | desired_playing_state=None, 51 | ) 52 | 53 | @staticmethod 54 | def annotate_detections(detections: Results) -> ndarray: 55 | return detections[0].plot() 56 | -------------------------------------------------------------------------------- /utils/widgets/lakera.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | import requests 5 | import streamlit as st 6 | 7 | import utils 8 | 9 | logger = utils.CustomLogger(__file__) 10 | 11 | st_ss = st.session_state 12 | 13 | 14 | class LakeraWidget: 15 | widget_key = "lakera_widget" 16 | checkbox_key = f"{widget_key}.checkbox" 17 | 18 | def __init__( 19 | self, 20 | default: bool = False, 21 | ): 22 | logger.info(f"Initializing {self.__class__.__name__}") 23 | self.api_key = os.getenv("LAKERA_GUARD_API_KEY") 24 | self.default = default 25 | 26 | @property 27 | def lakera_activated(self): 28 | return st.checkbox( 29 | label="Prompt injection security", 30 | value=st_ss.get(self.checkbox_key, self.default), 31 | key=self.checkbox_key, 32 | help="Use Lakera Guard API to defend against LLM prompt injections", 33 | on_change=self.authentificate, 34 | ) 35 | 36 | def request_api(self, input: str) -> requests.Response: 37 | return requests.post( 38 | url="https://api.lakera.ai/v1/prompt_injection", 39 | json={"input": input}, 40 | headers={"Authorization": f"Bearer {self.api_key}"}, 41 | ) 42 | 43 | def authentificate(self): 44 | if not st_ss.get(self.checkbox_key): 45 | return 46 | try: 47 | response = self.request_api("") 48 | except requests.exceptions.SSLError: 49 | st.toast("SSL CERTIFICATE VERIFY FAILED", icon="🚫") 50 | else: 51 | success = response.ok 52 | st.toast("Lakera Guard API authentication", icon="✅" if success else "🚫") 53 | 54 | def flag_prompt(self, prompt: str) -> t.Tuple[bool, t.Dict]: 55 | response = self.request_api(prompt).json() 56 | flagged = response["results"][0]["flagged"] 57 | return flagged, response 58 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/t-sne.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.datasets import Dataset 5 | from src.statistics.dimensionality_reduction import TSNEManager 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/statistics/dimensionality_rediction/tsne_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("t-SNE") 36 | with st.container(border=True): 37 | tsne_manager = 
TSNEManager(max_n_components=3) 38 | tsne_manager.set_model() 39 | 40 | tsne_manager.fit(data=X, target_col=y) 41 | 42 | st.subheader("Scatter matrix plot", divider="gray") 43 | st.plotly_chart(tsne_manager.scatter_matrix_plot(), use_container_width=True) 44 | 45 | st.subheader("Scatter 2D plot", divider="gray") 46 | try: 47 | st.plotly_chart(tsne_manager.scatter_2d_plot(), use_container_width=True) 48 | except ValueError: 49 | st.error("Number of principal components not sufficient for the plot") 50 | 51 | st.subheader("Scatter 3D plot", divider="gray") 52 | try: 53 | st.plotly_chart(tsne_manager.scatter_3d_plot(), use_container_width=True) 54 | except ValueError: 55 | st.error("Number of principal components not sufficient for the plot") 56 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/umap.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.datasets import Dataset 5 | from src.statistics.dimensionality_reduction import UMAPManager 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/statistics/dimensionality_rediction/umap_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("UMAP") 36 | with st.container(border=True): 37 | umap_manager = UMAPManager(max_n_components=3) 38 | umap_manager.set_model() 39 | 40 | umap_manager.fit(data=X, target_col=y) 41 | 42 | st.subheader("Scatter matrix plot", divider="gray") 43 | st.plotly_chart(umap_manager.scatter_matrix_plot(), use_container_width=True) 44 | 45 | st.subheader("Scatter 2D plot", divider="gray") 46 | try: 47 | st.plotly_chart(umap_manager.scatter_2d_plot(), use_container_width=True) 48 | except ValueError: 49 | st.error("Number of principal components not sufficient for the plot") 50 | 51 | st.subheader("Scatter 3D plot", divider="gray") 52 | try: 53 | st.plotly_chart(umap_manager.scatter_3d_plot(), use_container_width=True) 54 | except ValueError: 55 | st.error("Number of principal components not sufficient for the plot") 56 | -------------------------------------------------------------------------------- /utils/streamlit_display.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | 6 | 7 | def tabs_config(): 8 | st.markdown( 9 | """ 10 | 17 | """, 18 | unsafe_allow_html=True, 19 | ) 20 | 21 | 22 | def display_tab_content( 23 | label: t.Literal["train", "test"], 24 | X_data: pd.DataFrame, 25 | y_data: pd.DataFrame, 26 | label_mapping: t.Dict[int, str] | None = None, 27 | ): 28 | data_container = st.container() 29 | col1, col2 = data_container.columns([0.65, 0.35], gap="medium") 30 | with col1: 31 | st.markdown( 32 | f"

X_{label}

", unsafe_allow_html=True 33 | ) 34 | st.dataframe(data=X_data, use_container_width=True) 35 | 36 | with col2: 37 | st.markdown( 38 | f"

y_{label}

", unsafe_allow_html=True 39 | ) 40 | st.dataframe( 41 | data=y_data.map(label_mapping or (lambda x: x)), use_container_width=True 42 | ) 43 | 44 | describe_container = st.expander("Data statistics").container() 45 | col1, col2 = describe_container.columns([0.65, 0.35], gap="medium") 46 | with col1: 47 | st.dataframe(X_data.describe(), use_container_width=True) 48 | with col2: 49 | if label_mapping: 50 | st.dataframe( 51 | pd.concat( 52 | [ 53 | y_data.map(label_mapping).value_counts().sort_index(), 54 | y_data.map(label_mapping) 55 | .value_counts(normalize=True) 56 | .sort_index(), 57 | ], 58 | axis=1, 59 | ).round(3) 60 | ) 61 | else: 62 | st.dataframe(y_data.describe(), use_container_width=True) 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://data-science-superapp.streamlit.app) 3 | 4 | 5 | ## Prerequisites 6 | 7 | **Poetry**: If [Poetry](https://python-poetry.org/) is not installed, you can do so using pip: 8 | 9 | 10 | ```bash 11 | pip install poetry 12 | ``` 13 | 14 | **Docker**: If [Docker](https://www.docker.com/) is not installed, you can do so following [this link](https://docs.docker.com/get-docker/) 15 | 16 | ## Installation 17 | 18 | 1. Clone the repository: 19 | 20 | ```bash 21 | git clone https://github.com/daltunay/my-superapp.git 22 | cd my-superapp 23 | ``` 24 | 25 | 2. Set up the project dependencies using Poetry: 26 | 27 | ```bash 28 | poetry install 29 | ``` 30 | 31 | This command will create a virtual environment and install the necessary dependencies. 32 | 33 | ## Setting up API Keys 34 | 35 | The application uses several APIs to function properly. 36 | You can specifiy the API keys in `.streamlit/secrets.toml`: 37 | 38 | ```toml 39 | [twilio] 40 | TWILIO_ACCOUNT_SID = "<...>" 41 | TWILIO_AUTH_TOKEN = "<...>" 42 | 43 | [openai] 44 | OPENAI_API_KEY = "<...>" 45 | 46 | [together] 47 | TOGETHER_API_KEY = "<...>" 48 | 49 | [lakera_guard] 50 | LAKERA_GUARD_API_KEY = "<...>" 51 | 52 | [google] 53 | GOOGLE_API_KEY = "<...>" 54 | GOOGLE_CSE_ID = "<...>" 55 | ``` 56 | 57 | 58 | ## Running the Application 59 | The _my-superapp_ application can be run using either Poetry or Docker. 60 | 61 | ### Using Poetry 62 | 63 | To run the application using Poetry: 64 | 65 | ```bash 66 | poetry run streamlit run app.py 67 | ``` 68 | 69 | ### Using Docker 70 | 71 | 1. Build the Docker image: 72 | 73 | ```bash 74 | docker build -t my-superapp . 75 | ``` 76 | 77 | 2. Run the application as a Docker container: 78 | 79 | ```bash 80 | docker run -p 8501:8501 my-superapp 81 | ``` 82 | 83 | Alternatively, you can just run the following: 84 | 85 | ```bash 86 | chmod +x ./bin/run.sh 87 | ./bin/run.sh 88 | ``` 89 | 90 | Once the application is running, it will be accessible at http://localhost:8501 in your web browser. 
91 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot_web_summary.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.chains.combine_documents.base import BaseCombineDocumentsChain 5 | from langchain.chains.summarize import load_summarize_chain 6 | from langchain.docstore.document import Document 7 | from langchain.document_loaders import UnstructuredURLLoader 8 | from unstructured.cleaners.core import (clean, clean_extra_whitespace, 9 | remove_punctuation) 10 | 11 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs 12 | 13 | 14 | class ChatbotWebSummary(Chatbot): 15 | available_chain_types = ["stuff", "map_reduce"] 16 | 17 | def __init__( 18 | self, 19 | chain_type: t.Literal["stuff"] | t.Literal["map_reduce"] = "stuff", 20 | **model_kwargs: t.Unpack[ModelArgs], 21 | ) -> None: 22 | super().__init__(**model_kwargs) 23 | self.chain_type = chain_type 24 | 25 | @staticmethod 26 | def url_to_doc(source_url: str) -> Document: 27 | url_loader = UnstructuredURLLoader( 28 | urls=[source_url], 29 | mode="elements", 30 | post_processors=[clean, remove_punctuation, clean_extra_whitespace], 31 | ) 32 | 33 | narrative_elements = [ 34 | element 35 | for element in url_loader.load() 36 | if element.metadata.get("category") == "NarrativeText" 37 | ] 38 | cleaned_content = " ".join( 39 | element.page_content for element in narrative_elements 40 | ) 41 | 42 | return Document(page_content=cleaned_content, metadata={"source": source_url}) 43 | 44 | @cached_property 45 | def chain(self) -> BaseCombineDocumentsChain: 46 | return load_summarize_chain(self.llm, chain_type=self.chain_type, verbose=True) 47 | 48 | def summarize(self, url: str) -> str: 49 | document = self.url_to_doc(url) 50 | return self.chain.run( 51 | [document], 52 | callbacks=self.callbacks, 53 | ) 54 | -------------------------------------------------------------------------------- /pages/dimensionality_reduction/pca.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.datasets import Dataset 5 | from src.statistics.dimensionality_reduction import PCAManager 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/statistics/dimensionality_rediction/pca_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("PCA") 36 | with st.container(border=True): 37 | pca_manager = PCAManager(max_n_components=3) 38 | pca_manager.set_model() 39 | 40 | pca_manager.fit(data=X, target_col=y) 41 | 42 | st.subheader("Scatter matrix plot", divider="gray") 43 | st.plotly_chart(pca_manager.scatter_matrix_plot(), 
use_container_width=True) 44 | 45 | st.subheader("Explained variance plot", divider="gray") 46 | st.plotly_chart(pca_manager.explained_variance_plot(), use_container_width=True) 47 | 48 | st.subheader("Scatter 2D + Loadings plot", divider="gray") 49 | try: 50 | st.plotly_chart(pca_manager.loadings_plot(), use_container_width=True) 51 | except ValueError: 52 | st.error("Number of principal components not sufficient for the plot") 53 | 54 | st.subheader("Scatter 3D plot", divider="gray") 55 | try: 56 | st.plotly_chart(pca_manager.scatter_3d_plot(), use_container_width=True) 57 | except ValueError: 58 | st.error("Number of principal components not sufficient for the plot") 59 | -------------------------------------------------------------------------------- /pages/regression/xgboost.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning import XGBoostManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/machine_learning/xgboost_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type="regression") 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=True) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X_train, X_test = dataset.X 28 | y_train, y_test = dataset.y 29 | label_mapping = dataset.label_mapping 30 | 31 | st.subheader("Visualize data") 32 | train_tab, test_tab = st.tabs(tabs=["Train", "Test"]) 33 | with train_tab: 34 | with st.container(border=True): 35 | utils.display_tab_content("train", X_train, y_train, label_mapping) 36 | with test_tab: 37 | with st.container(border=True): 38 | utils.display_tab_content("test", X_test, y_test, label_mapping) 39 | 40 | st.header("Regression", divider="gray") 41 | st.markdown( 42 | "Regression model: `XGBRegressor` from `xgboost` " 43 | "([official documentation](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor))" 44 | ) 45 | regression_manager = XGBoostManager(task="regression") 46 | 47 | st.subheader("Hyperparameters") 48 | with st.container(border=True): 49 | regression_manager.set_model() 50 | 51 | st.subheader("Evaluation") 52 | regression_manager.fit(X_train, y_train) 53 | regression_manager.evaluate(X_test, y_test) 54 | st.markdown("Metrics Report") 55 | st.columns([0.5, 1, 0.5])[1].dataframe( 56 | data=regression_manager.metrics_report.round(2), use_container_width=True 57 | ) 58 | st.subheader("Explainability") 59 | st.markdown("SHAP force plot") 60 | utils.st_shap(plot=regression_manager.shap_force_plot(X_test), height=400) 61 | -------------------------------------------------------------------------------- /pages/clustering/kmeans.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning.clustering import KMeansManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | 
utils.show_source_code("src/machine_learning/clustering/kmeans_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type=None) 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X, y = dataset.X, dataset.y 28 | if label_mapping := dataset.label_mapping: 29 | y = y.map(label_mapping) 30 | 31 | st.subheader("Visualize data") 32 | with st.container(border=True): 33 | utils.display_tab_content("data", X, y) 34 | 35 | st.subheader("K-Means") 36 | with st.container(border=True): 37 | kmeans_manager = KMeansManager(max_n_clusters=10) 38 | kmeans_manager.set_model() 39 | 40 | kmeans_manager.fit(data=X) 41 | 42 | st.subheader("Scatter plot", divider="gray") 43 | col_x, col_y = st.columns(2) 44 | x_col_scatter = col_x.selectbox( 45 | label="X column", key="scatter_x", options=X.columns, index=0 46 | ) 47 | y_col_scatter = col_y.selectbox( 48 | label="Y column", key="scatter_y", options=X.columns, index=1 49 | ) 50 | st.plotly_chart( 51 | kmeans_manager.scatter_plot(x_col_scatter, y_col_scatter), 52 | use_container_width=True, 53 | ) 54 | 55 | st.subheader("Centroids plot", divider="gray") 56 | col_x, col_y = st.columns(2) 57 | x_col_centroids = col_x.selectbox( 58 | label="X column", key="centroids_x", options=X.columns, index=0 59 | ) 60 | y_col_centroids = col_y.selectbox( 61 | label="Y column", key="centroids_y", options=X.columns, index=1 62 | ) 63 | st.plotly_chart( 64 | kmeans_manager.centroids_plot(x_col_centroids, y_col_centroids), 65 | use_container_width=True, 66 | ) 67 | -------------------------------------------------------------------------------- /src/machine_learning/clustering/dbscan_manager.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import plotly.express as px 3 | import streamlit as st 4 | from sklearn.cluster import DBSCAN 5 | 6 | 7 | class DBScanManager: 8 | def __init__(self): 9 | self.model: DBSCAN | None = None 10 | 11 | @property 12 | def params(self) -> dict: 13 | columns = st.columns(2) 14 | return { 15 | "eps": columns[0].slider( 16 | label="Maximum Distance (eps)", 17 | min_value=0.1, 18 | max_value=5.0, 19 | value=1.0, 20 | step=0.1, 21 | help="Maximum distance between two samples for one to be considered as in the neighborhood of the other.", 22 | ), 23 | "min_samples": columns[1].slider( 24 | label="Minimum Samples", 25 | min_value=1, 26 | max_value=10, 27 | value=5, 28 | step=1, 29 | help="The number of samples in a neighborhood for a point to be considered as a core point.", 30 | ), 31 | } 32 | 33 | @staticmethod 34 | @st.cache_resource(show_spinner=True) 35 | def _get_model(eps: float, min_samples: int) -> DBSCAN: 36 | return DBSCAN(eps=eps, min_samples=min_samples) 37 | 38 | def set_model(self) -> None: 39 | self.model = self._get_model(**self.params) 40 | 41 | @staticmethod 42 | @st.cache_resource( 43 | show_spinner=True, 44 | hash_funcs={DBSCAN: lambda model: (model.eps, model.min_samples)}, 45 | ) 46 | def _perform_clustering(model: DBSCAN, data: pd.DataFrame) -> pd.DataFrame: 47 | clusters = model.fit_predict(data) 48 | data = data.assign(Cluster=clusters) 49 | data["Cluster"] = data["Cluster"].astype(str) 50 | return model, data 51 | 52 | def fit(self, data: pd.DataFrame): 53 | self.model, self.data_clustered = self._perform_clustering( 54 | model=self.model, data=data 55 | ) 56 | 57 | def
scatter_plot(self, x_col: str, y_col: str) -> None: 58 | return px.scatter( 59 | self.data_clustered, 60 | x=x_col, 61 | y=y_col, 62 | color="Cluster", 63 | labels={"color": "Cluster"}, 64 | ) 65 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot_rag.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.chains import ConversationalRetrievalChain 5 | from langchain.chains.conversational_retrieval.base import \ 6 | BaseConversationalRetrievalChain 7 | from langchain.embeddings import OpenAIEmbeddings 8 | from langchain.vectorstores import FAISS 9 | from langchain.vectorstores.base import VectorStoreRetriever 10 | 11 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs 12 | 13 | 14 | class ChatbotRAG(Chatbot): 15 | def __init__( 16 | self, 17 | vector_store: FAISS | None = None, 18 | embeddings_kwargs: t.Dict | None = None, 19 | search_kwargs: t.Dict | None = None, 20 | **model_kwargs: t.Unpack[ModelArgs], 21 | ) -> None: 22 | super().__init__(**model_kwargs) 23 | if vector_store: 24 | self.vector_store = vector_store 25 | self.embeddings_kwargs = embeddings_kwargs or {} 26 | self.search_kwargs = search_kwargs or {} 27 | 28 | @cached_property 29 | def embeddings(self) -> OpenAIEmbeddings: 30 | return OpenAIEmbeddings(**self.embeddings_kwargs) 31 | 32 | @cached_property 33 | def vector_store(self) -> FAISS: 34 | return FAISS.load_local(folder_path="faiss_index", embeddings=self.embeddings) 35 | 36 | @cached_property 37 | def retriever(self) -> VectorStoreRetriever: 38 | return self.vector_store.as_retriever( 39 | search_type="similarity", 40 | search_kwargs=self.search_kwargs, 41 | ) 42 | 43 | @cached_property 44 | def chain(self) -> BaseConversationalRetrievalChain: 45 | return ConversationalRetrievalChain.from_llm( 46 | llm=self.llm, 47 | memory=self.memory, 48 | verbose=True, 49 | combine_docs_chain_kwargs={"prompt": self.template}, 50 | chain_type="stuff", 51 | retriever=self.retriever, 52 | ) 53 | 54 | def ask( 55 | self, 56 | query: str, 57 | language: str | None = None, 58 | ) -> str: 59 | return self.chain.run( 60 | question=query, 61 | language=language or "the input language", 62 | callbacks=self.callbacks, 63 | ) 64 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from pages.large_language_models import LLM_CONFIG 5 | from src.generative_ai.large_language_models import Chatbot 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | st_ss = st.session_state 11 | 12 | 13 | def main(): 14 | utils.show_source_code("src/generative_ai/large_language_models/chatbots/chatbot.py") 15 | with st.expander(label="Chat parameters", expanded=True): 16 | col1, col2 = st.columns(2) 17 | with col1: 18 | selected_language = st_ss.setdefault( 19 | "language_widget", utils.LanguageWidget() 20 | ).selected_language 21 | with col2: 22 | lakera_activated = st_ss.setdefault( 23 | "lakera_widget", utils.LakeraWidget() 24 | ).lakera_activated 25 | 26 | chosen_model = st.selectbox( 27 | label="Large Language Model:", 28 | placeholder="Choose an option", 29 | options=LLM_CONFIG.keys(), 30 | index=0, 31 | 
on_change=utils.reset_session_state_key, 32 | kwargs={"key": "chatbot"}, 33 | ) 34 | 35 | provided_context = st.text_area( 36 | label="Context:", 37 | value="", 38 | help="This context will be passed to the chatbot.", 39 | ) 40 | 41 | if chosen_model: 42 | chatbot = st_ss.setdefault("chatbot", Chatbot(**LLM_CONFIG[chosen_model])) 43 | for message in chatbot.history: 44 | st.chat_message(message["role"]).write(message["content"]) 45 | else: 46 | pass 47 | 48 | if prompt := st.chat_input( 49 | placeholder=f"Chat with {chosen_model}!" if chosen_model else "", 50 | disabled=not chosen_model, 51 | ): 52 | st.chat_message("human").write(prompt) 53 | if lakera_activated: 54 | flag, response = st_ss.setdefault( 55 | "lakera_widget", utils.LakeraWidget() 56 | ).flag_prompt(prompt=prompt) 57 | if flag: 58 | st.warning(body="Prompt injection detected", icon="🚨") 59 | st.expander(label="LOGS").json(response) 60 | with st.chat_message("ai"): 61 | chatbot.ask( 62 | query=prompt, 63 | context=provided_context, 64 | language=selected_language, 65 | ) 66 | -------------------------------------------------------------------------------- /pages/statistical_tests/ab_test.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.statistics.statistical_tests import ABTesting, input_group_data 5 | 6 | loader = utils.PageConfigLoader(__file__) 7 | loader.set_page_config(globals()) 8 | 9 | 10 | def main(): 11 | st.header("Data", divider="gray") 12 | a_col, b_col = st.columns(2, gap="small") 13 | with a_col.container(border=True): 14 | st.subheader("Group A") 15 | a_visitors, a_conversions, a_rate = input_group_data( 16 | group_name="A", default_visitors=1000, default_conversions=50 17 | ) 18 | with b_col.container(border=True): 19 | st.subheader("Group B") 20 | b_visitors, b_conversions, b_rate = input_group_data( 21 | group_name="B", default_visitors=200, default_conversions=35 22 | ) 23 | 24 | st.header("Settings", divider="gray") 25 | settings_container = st.container(border=True) 26 | test_type = settings_container.selectbox( 27 | label="Test type", 28 | key="ab_test.test_type", 29 | options=["one-sided", "two-sided"], 30 | index=1, 31 | format_func=lambda x: x.replace("-", " ").capitalize(), 32 | ) 33 | confidence_col, alpha_col = settings_container.columns(2) 34 | confidence = confidence_col.columns([0.15, 1, 0.15])[1].select_slider( 35 | "Confidence level", 36 | options=[0.9, 0.95, 0.99], 37 | value=0.95, 38 | key="ab_test.confidence", 39 | format_func=lambda x: f"{100*x}%", 40 | on_change=utils.update_slider_callback, 41 | kwargs={"updated": "ab_test.confidence", "to_update": "ab_test.alpha"}, 42 | ) 43 | alpha = alpha_col.columns([0.15, 1, 0.15])[1].select_slider( 44 | "Alpha value", 45 | options=[0.01, 0.05, 0.1], 46 | value=0.05, 47 | key="ab_test.alpha", 48 | format_func=lambda x: f"{100*x}%", 49 | on_change=utils.update_slider_callback, 50 | kwargs={"updated": "ab_test.alpha", "to_update": "ab_test.confidence"}, 51 | ) 52 | 53 | ab_testing = ABTesting(a_visitors, a_rate, b_visitors, b_rate, alpha, test_type) 54 | 55 | st.header("Results", divider="gray") 56 | result = ab_testing.perform_ab_test() 57 | 58 | if result["is_significant"]: 59 | st.success("The difference is significant", icon="✅") 60 | else: 61 | st.error("The difference is not significant", icon="❌") 62 | 63 | st.expander(label="Test details").json(result) 64 | -------------------------------------------------------------------------------- 
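The A/B test page above delegates the statistics to the `ABTesting` class (defined in `src/statistics/statistical_tests/ab_test.py`, shown further down). As a rough, self-contained sketch of the same kind of two-proportion significance check — illustrative only, using the page's default group sizes and plain `scipy` rather than the app's own class:

```python
from scipy import stats

# Default figures from the A/B test page above (hypothetical data)
a_visitors, a_conversions = 1000, 50
b_visitors, b_conversions = 200, 35
a_rate, b_rate = a_conversions / a_visitors, b_conversions / b_visitors

# Per-visitor standard deviation of a 0/1 conversion outcome: sqrt(p * (1 - p))
a_std = (a_rate * (1 - a_rate)) ** 0.5
b_std = (b_rate * (1 - b_rate)) ** 0.5

# Two-sided t-test from summary statistics
t_statistic, p_value = stats.ttest_ind_from_stats(
    mean1=a_rate, std1=a_std, nobs1=a_visitors,
    mean2=b_rate, std2=b_std, nobs2=b_visitors,
)
print(f"t = {t_statistic:.3f}, p = {p_value:.4f}, significant at alpha=0.05: {p_value < 0.05}")
```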
/pages/classification/xgboost.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from src.machine_learning import XGBoostManager 5 | from src.machine_learning.datasets import Dataset 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | logger = utils.CustomLogger(__file__) 11 | 12 | st_ss = st.session_state 13 | 14 | 15 | def main(): 16 | utils.tabs_config() 17 | utils.show_source_code("src/machine_learning/xgboost_manager.py") 18 | 19 | st.header("Dataset", divider="gray") 20 | dataset = Dataset(type="classification") 21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=True) 22 | dataset.set(raw_dataset_dict) 23 | 24 | with st.expander(label="Dataset description"): 25 | st.markdown(dataset.description) 26 | 27 | X_train, X_test = dataset.X 28 | y_train, y_test = dataset.y 29 | label_mapping = dataset.label_mapping 30 | 31 | st.subheader("Visualize data") 32 | train_tab, test_tab = st.tabs(tabs=["Train", "Test"]) 33 | with train_tab: 34 | with st.container(border=True): 35 | utils.display_tab_content("train", X_train, y_train, label_mapping) 36 | with test_tab: 37 | with st.container(border=True): 38 | utils.display_tab_content("test", X_test, y_test, label_mapping) 39 | 40 | st.header("Classification", divider="gray") 41 | st.markdown( 42 | "Classification model: `XGBClassifier` from `xgboost` " 43 | "([official documentation](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier))" 44 | ) 45 | classification_manager = XGBoostManager(task="classification") 46 | 47 | st.subheader("Hyperparameters") 48 | with st.container(border=True): 49 | classification_manager.set_model(label_mapping=label_mapping) 50 | 51 | st.subheader("Evaluation") 52 | classification_manager.fit(X_train, y_train) 53 | classification_manager.evaluate( 54 | X_test, y_test, target_names=list(label_mapping.values()) 55 | ) 56 | st.markdown("Classification Report") 57 | st.columns(3)[1].dataframe( 58 | data=classification_manager.classification_report, use_container_width=True 59 | ) 60 | st.markdown("Confusion Matrix") 61 | st.columns([0.1, 1, 0.1])[1].pyplot( 62 | fig=classification_manager.confusion_matrix_display( 63 | display_labels=list(label_mapping.values()) 64 | ) 65 | ) 66 | st.subheader("Explainability") 67 | st.markdown("SHAP force plot") 68 | utils.st_shap(plot=classification_manager.shap_force_plot(X_test), height=400) 69 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot_tools.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from pages.large_language_models import LLM_CONFIG 5 | from src.generative_ai.large_language_models import ChatbotTools 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | st_ss = st.session_state 11 | 12 | 13 | def main(): 14 | utils.show_source_code( 15 | "src/generative_ai/large_language_models/chatbots/chatbot_tools.py" 16 | ) 17 | with st.expander(label="Chat parameters", expanded=True): 18 | col1, col2 = st.columns(2) 19 | with col1: 20 | selected_language = st_ss.setdefault( 21 | "language_widget", utils.LanguageWidget() 22 | ).selected_language 23 | with col2: 24 | lakera_activated = st_ss.setdefault( 25 | "lakera_widget", utils.LakeraWidget() 26 | ).lakera_activated 27 | 28 | chosen_model = st.selectbox( 
29 | label="Large Language Model:", 30 | placeholder="Choose an option", 31 | options=LLM_CONFIG.keys(), 32 | index=0, 33 | on_change=utils.reset_session_state_key, 34 | kwargs={"key": "chatbot_tools"}, 35 | ) 36 | 37 | chosen_tools = st.multiselect( 38 | label="Tools:", 39 | options=ChatbotTools.available_tools, 40 | default=None, 41 | on_change=utils.reset_session_state_key, 42 | kwargs={"key": "chatbot_tools"}, 43 | ) 44 | 45 | if chosen_model and chosen_tools: 46 | chatbot = st_ss.setdefault( 47 | "chatbot_tools", 48 | ChatbotTools(**LLM_CONFIG[chosen_model], tool_names=chosen_tools), 49 | ) 50 | for message in chatbot.history: 51 | st.chat_message(message["role"]).write(message["content"]) 52 | else: 53 | st.info("Choose tools for the LLM", icon="ℹ️") 54 | 55 | if prompt := st.chat_input( 56 | placeholder=f"Chat with {chosen_model}!" 57 | if (chosen_model and chosen_tools) 58 | else "", 59 | disabled=not (chosen_model and chosen_tools), 60 | ): 61 | st.chat_message("human").write(prompt) 62 | if lakera_activated: 63 | flag, response = st_ss.setdefault( 64 | "lakera_widget", utils.LakeraWidget() 65 | ).flag_prompt(prompt=prompt) 66 | if flag: 67 | st.warning(body="Prompt injection detected", icon="🚨") 68 | st.expander(label="LOGS").json(response) 69 | with st.chat_message("ai"): 70 | st.write(chatbot.ask( 71 | query=prompt, 72 | language=selected_language, 73 | )) 74 | -------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot_tools.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.agents import (AgentExecutor, AgentType, initialize_agent, 5 | load_tools) 6 | from langchain.callbacks.base import BaseCallbackHandler 7 | from langchain.tools import BaseTool 8 | 9 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs 10 | 11 | 12 | class ChatbotTools(Chatbot): 13 | available_tools = ["google-search", "arxiv", "wikipedia", "stackexchange", "human"] 14 | 15 | def __init__( 16 | self, 17 | tool_names: t.List[str] | None = None, 18 | **model_kwargs: t.Unpack[ModelArgs], 19 | ) -> None: 20 | super().__init__(**model_kwargs) 21 | self.tool_names = tool_names or [] 22 | self.memory.input_key = "input" 23 | 24 | # @property 25 | # def callbacks(self) -> t.List[BaseCallbackHandler]: 26 | # return [super().callbacks[1]] 27 | 28 | @cached_property 29 | def tools(self) -> t.List[BaseTool]: 30 | return load_tools(tool_names=self.tool_names) 31 | 32 | @staticmethod 33 | def update_agent_prompt_template( 34 | agent: AgentExecutor, 35 | text: str, 36 | input_variable: str | None = None, 37 | ): 38 | template = agent.agent.llm_chain.prompt.template 39 | newline_index = agent.agent.llm_chain.prompt.template.find("\n\n") 40 | agent.agent.llm_chain.prompt.template = text + template[newline_index:] 41 | if input_variable: 42 | agent.agent.llm_chain.prompt.input_variables.append(input_variable) 43 | return agent 44 | 45 | @cached_property 46 | def chain(self) -> AgentExecutor: 47 | agent = initialize_agent( 48 | llm=self.llm, 49 | memory=self.memory, 50 | verbose=True, 51 | agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION, 52 | agent_kwargs={ 53 | "input_variables": [ 54 | "input", 55 | "chat_history", 56 | "agent_scratchpad", 57 | "language", 58 | ] 59 | }, 60 | tools=self.tools, 61 | handle_parsing_errors=True, 62 | return_intermediate_steps=False, 63 | ) 64 | agent = 
self.update_agent_prompt_template( 65 | agent=agent, 66 | text="Assistant is a large language model, speaking in {language}.", 67 | input_variable="language", 68 | ) 69 | return agent 70 | 71 | def ask( 72 | self, 73 | query: str, 74 | language: str | None = None, 75 | ) -> str: 76 | return self.chain.run( 77 | input=query, 78 | language=language or "the input language", 79 | callbacks=self.callbacks, 80 | ) 81 | -------------------------------------------------------------------------------- /src/machine_learning/clustering/kmeans_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | import streamlit as st 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | class KMeansManager: 11 | def __init__(self, max_n_clusters: int): 12 | self.max_n_clusters = max_n_clusters 13 | self.model: KMeans | None = None 14 | 15 | @property 16 | def params(self) -> t.Dict[str, int]: 17 | columns = st.columns(2) 18 | return { 19 | "n_clusters": columns[0].slider( 20 | label="Number of Clusters", 21 | min_value=1, 22 | max_value=self.max_n_clusters, 23 | value=2, 24 | step=1, 25 | help="Number of clusters to form.", 26 | ), 27 | } 28 | 29 | @staticmethod 30 | @st.cache_resource(show_spinner=True) 31 | def _get_model(n_clusters: int) -> KMeans: 32 | return KMeans(n_clusters=n_clusters, n_init="auto") 33 | 34 | def set_model(self) -> None: 35 | params = self.params 36 | self.model = self._get_model(params["n_clusters"]) 37 | 38 | @staticmethod 39 | @st.cache_resource( 40 | show_spinner=True, 41 | hash_funcs={KMeans: lambda model: model.n_clusters}, 42 | ) 43 | def _perform_clustering(model: KMeans, data: pd.DataFrame) -> pd.DataFrame: 44 | model = model.fit(data) 45 | clusters = model.predict(data) 46 | data = data.assign(Cluster=clusters) 47 | data["Cluster"] = data["Cluster"].astype(str) 48 | return model, data 49 | 50 | def fit(self, data: pd.DataFrame): 51 | self.model, self.data_clustered = self._perform_clustering( 52 | model=self.model, data=data 53 | ) 54 | 55 | def scatter_plot(self, x_col: str, y_col: str) -> None: 56 | return px.scatter( 57 | self.data_clustered, 58 | x=x_col, 59 | y=y_col, 60 | color="Cluster", 61 | labels={"color": "Cluster"}, 62 | ) 63 | 64 | def centroids_plot(self, x_col: str, y_col: str) -> None: 65 | centroids = pd.DataFrame( 66 | self.model.cluster_centers_, 67 | columns=[f"{col}_centroid" for col in self.data_clustered.columns[:-1]], 68 | ) 69 | centroids[x_col] = centroids[f"{x_col}_centroid"] 70 | centroids[y_col] = centroids[f"{y_col}_centroid"] 71 | 72 | fig = px.scatter( 73 | self.data_clustered, 74 | x=x_col, 75 | y=y_col, 76 | color="Cluster", 77 | labels={"color": "Cluster"}, 78 | ) 79 | 80 | fig.add_trace( 81 | go.Scatter( 82 | x=centroids[x_col], 83 | y=centroids[y_col], 84 | mode="markers", 85 | marker=dict(size=20, symbol="x", color="white"), 86 | name="Centroids", 87 | ) 88 | ) 89 | return fig 90 | -------------------------------------------------------------------------------- /pages/large_language_models/chatbot_rag.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import utils 4 | from pages.large_language_models import LLM_CONFIG 5 | from src.generative_ai.large_language_models import (ChatbotRAG, 6 | get_vector_store) 7 | 8 | loader = utils.PageConfigLoader(__file__) 9 | loader.set_page_config(globals()) 10 | 11 | st_ss = 
st.session_state 12 | 13 | 14 | def main(): 15 | utils.show_source_code( 16 | path="src/generative_ai/large_language_models/chatbots/chatbot_rag.py" 17 | ) 18 | with st.expander(label="Chat parameters", expanded=True): 19 | col1, col2 = st.columns(2) 20 | with col1: 21 | selected_language = st_ss.setdefault( 22 | "language_widget", utils.LanguageWidget() 23 | ).selected_language 24 | with col2: 25 | lakera_activated = st_ss.setdefault( 26 | "lakera_widget", utils.LakeraWidget() 27 | ).lakera_activated 28 | 29 | chosen_model = st.selectbox( 30 | label="Large Language Model:", 31 | placeholder="Choose an option", 32 | options=LLM_CONFIG.keys(), 33 | index=0, 34 | on_change=utils.reset_session_state_key, 35 | kwargs={"key": "chatbot_rag"}, 36 | ) 37 | 38 | if uploaded_file := st.file_uploader( 39 | "Upload a PDF file", 40 | type="pdf", 41 | accept_multiple_files=False, 42 | help="https://python.langchain.com/docs/use_cases/question_answering/#what-is-rag", 43 | on_change=utils.reset_session_state_key, 44 | kwargs={"key": "chatbot_rag"}, 45 | ): 46 | with open(uploaded_file.name, "wb") as f: 47 | f.write(uploaded_file.getbuffer()) 48 | vector_db = get_vector_store(file=uploaded_file.name, mode="upload") 49 | 50 | if chosen_model and uploaded_file: 51 | chatbot = st_ss.setdefault( 52 | "chatbot_rag", 53 | ChatbotRAG(vector_store=vector_db, **LLM_CONFIG[chosen_model]), 54 | ) 55 | for message in chatbot.history: 56 | st.chat_message(message["role"]).write(message["content"]) 57 | else: 58 | st.info("Please upload a PDF file for the RAG", icon="ℹ️") 59 | 60 | if prompt := st.chat_input( 61 | placeholder=f"Chat with {chosen_model}!" 62 | if (chosen_model and uploaded_file) 63 | else "", 64 | disabled=not (chosen_model and uploaded_file), 65 | ): 66 | st.chat_message("human").write(prompt) 67 | if lakera_activated: 68 | flag, response = st_ss.setdefault( 69 | "lakera_widget", utils.LakeraWidget() 70 | ).flag_prompt(prompt=prompt) 71 | if flag: 72 | st.warning(body="Prompt injection detected", icon="🚨") 73 | st.expander(label="LOGS").json(response) 74 | with st.chat_message("ai"): 75 | chatbot.ask( 76 | query=prompt, 77 | language=selected_language, 78 | ) 79 | -------------------------------------------------------------------------------- /src/computer_vision/object_detection/face_detection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from functools import cached_property 4 | 5 | import cv2 6 | import mediapipe as mp 7 | import streamlit_webrtc as st_webrtc 8 | from av import VideoFrame 9 | from mediapipe.framework.formats import detection_pb2 10 | from numpy import ndarray 11 | 12 | import utils 13 | 14 | logger = utils.CustomLogger(__file__) 15 | 16 | os.environ["MEDIAPIPE_DISABLE_GPU"] = "1" 17 | 18 | 19 | class FaceDetectionApp: 20 | def __init__(self): 21 | pass 22 | 23 | @cached_property 24 | def detector(self): 25 | return mp.solutions.face_detection.FaceDetection( 26 | min_detection_confidence=0.5, 27 | model_selection=0, 28 | ) 29 | 30 | def detect_faces(self, image: ndarray) -> t.Any: 31 | return self.detector.process(image).detections 32 | 33 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame: 34 | image = frame.to_ndarray(format="bgr24") 35 | 36 | detection_list = self.detect_faces(image) 37 | self.annotate_faces( 38 | image=image, 39 | detection_list=detection_list, 40 | ) 41 | utils.annotate_time(image=image) 42 | return VideoFrame.from_ndarray(image, format="bgr24") 43 | 44 | def 
stream(self) -> None: 45 | st_webrtc.webrtc_streamer( 46 | video_frame_callback=self.video_frame_callback, 47 | key="face_streamer", 48 | mode=st_webrtc.WebRtcMode.SENDRECV, 49 | rtc_configuration=st_webrtc.RTCConfiguration( 50 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"} 51 | ), 52 | media_stream_constraints={"video": True, "audio": False}, 53 | async_processing=True, 54 | desired_playing_state=None, 55 | ) 56 | 57 | @staticmethod 58 | def annotate_faces( 59 | image: ndarray, 60 | detection_list: t.List[detection_pb2.Detection], 61 | ) -> None: 62 | if not detection_list: 63 | return 64 | 65 | for detection in detection_list: 66 | score = detection.score[0] 67 | bbox = detection.location_data.relative_bounding_box 68 | height, width, _ = image.shape 69 | xmin, ymin = int(bbox.xmin * width), int(bbox.ymin * height) 70 | xmax, ymax = int((bbox.xmin + bbox.width) * width), int( 71 | (bbox.ymin + bbox.height) * height 72 | ) 73 | cv2.rectangle( 74 | img=image, 75 | pt1=(xmin, ymin), 76 | pt2=(xmax, ymax), 77 | color=(0, 255, 0), 78 | thickness=3, 79 | ) 80 | cv2.putText( 81 | img=image, 82 | text=f"score: {score:.3f}", 83 | org=(xmin, ymin - 10), 84 | fontFace=cv2.FONT_HERSHEY_SIMPLEX, 85 | fontScale=0.5, 86 | color=(0, 255, 0), 87 | thickness=2, 88 | ) 89 | -------------------------------------------------------------------------------- /pages/statistical_tests/chi2_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | 4 | import utils 5 | from src.statistics.statistical_tests import Chi2Testing 6 | 7 | loader = utils.PageConfigLoader(__file__) 8 | loader.set_page_config(globals()) 9 | 10 | 11 | def main(): 12 | st.header("Data", divider="gray") 13 | observed_template = pd.DataFrame( 14 | data=[["Group A", 30, 20], ["Group B", 70, 80]], 15 | index=None, 16 | columns=["group", "category_1", "category_2"], 17 | ) 18 | col_df, col_sum = st.columns([0.8, 0.2]) 19 | with col_df: 20 | observed = st.data_editor( 21 | data=observed_template, 22 | hide_index=True, 23 | column_config={ 24 | "group": st.column_config.TextColumn( 25 | "Group", 26 | help="The name of the considered group.", 27 | ), 28 | "category_1": st.column_config.NumberColumn( 29 | "Category 1", 30 | min_value=1, 31 | required=True, 32 | help="The observed values for the category 1.", 33 | ), 34 | "category_2": st.column_config.NumberColumn( 35 | "Category 2", 36 | min_value=1, 37 | required=True, 38 | help="The observed values for the category 2.", 39 | ), 40 | }, 41 | disabled=False, 42 | use_container_width=True, 43 | ) 44 | st.info("Click on any cell to change its content.", icon="💡") 45 | with col_sum: 46 | total_col = observed.drop("group", axis=1).sum(axis=1).to_frame(name="Total") 47 | st.dataframe(total_col, hide_index=True, use_container_width=True) 48 | 49 | st.header("Settings", divider="gray") 50 | settings_container = st.container(border=True) 51 | confidence_col, alpha_col = settings_container.columns(2) 52 | confidence = confidence_col.columns([0.15, 1, 0.15])[1].select_slider( 53 | "Confidence level", 54 | options=[0.9, 0.95, 0.99], 55 | value=0.95, 56 | key="chi2_test.confidence", 57 | format_func=lambda x: f"{100*x}%", 58 | on_change=utils.update_slider_callback, 59 | kwargs={"updated": "chi2_test.confidence", "to_update": "chi2_test.alpha"}, 60 | ) 61 | alpha = alpha_col.columns([0.15, 1, 0.15])[1].select_slider( 62 | "Alpha value", 63 | options=[0.01, 0.05, 0.1], 64 | value=0.05, 65 | 
key="chi2_test.alpha", 66 | format_func=lambda x: f"{100*x}%", 67 | on_change=utils.update_slider_callback, 68 | kwargs={"updated": "chi2_test.alpha", "to_update": "chi2_test.confidence"}, 69 | ) 70 | 71 | chi2_testing = Chi2Testing(observed.drop("group", axis=1), alpha) 72 | 73 | st.header("Results", divider="gray") 74 | result = chi2_testing.perform_chi2_test() 75 | 76 | if result["is_significant"]: 77 | st.success("The difference is significant", icon="✅") 78 | else: 79 | st.error("The difference is not significant", icon="❌") 80 | 81 | st.expander(label="Test details").json(result) 82 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/umap_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import plotly.express as px 5 | import streamlit as st 6 | from umap import UMAP 7 | 8 | 9 | class UMAPManager: 10 | def __init__(self, max_n_components: int): 11 | self.max_n_components = max_n_components 12 | self.model: UMAP | None = None 13 | self.target_col: pd.Series | None = None 14 | self.embedded_data_df: pd.DataFrame | None = None 15 | 16 | @property 17 | def params(self) -> t.Dict[str, int | float]: 18 | columns = st.columns(3) 19 | return { 20 | "n_components": columns[0].slider( 21 | label="Number of Components", 22 | min_value=1, 23 | max_value=self.max_n_components, 24 | value=3, 25 | step=1, 26 | help="Number of components to compute.", 27 | ), 28 | "n_neighbors": columns[1].slider( 29 | label="Number of Neighbors", 30 | min_value=2, 31 | max_value=100, 32 | value=15, 33 | step=1, 34 | help="Size of local neighborhood used for manifold approximation.", 35 | ), 36 | "min_dist": columns[2].slider( 37 | label="Minimum Distance", 38 | min_value=0.1, 39 | max_value=1.0, 40 | value=0.5, 41 | step=0.1, 42 | help="Minimum distance between embedded points.", 43 | ), 44 | } 45 | 46 | @st.cache_resource(show_spinner=True) 47 | def _get_model(_self, params: t.Dict[str, int | float]) -> UMAP: 48 | return UMAP( 49 | n_components=params["n_components"], 50 | n_neighbors=params["n_neighbors"], 51 | min_dist=params["min_dist"], 52 | ) 53 | 54 | def set_model(self) -> None: 55 | params = self.params 56 | self.model = self._get_model(params) 57 | 58 | @st.cache_resource( 59 | show_spinner=True, 60 | hash_funcs={ 61 | UMAP: lambda model: (model.n_components, model.n_neighbors, model.min_dist) 62 | }, 63 | ) 64 | def _compute_umap(_self, model: UMAP, data: pd.DataFrame) -> pd.DataFrame: 65 | embedded_data = model.fit_transform(data) 66 | column_names = [f"D{i}" for i in range(1, model.n_components + 1)] 67 | return pd.DataFrame(embedded_data, columns=column_names) 68 | 69 | def fit(self, data: pd.DataFrame, target_col: pd.Series): 70 | self.embedded_data_df = self._compute_umap(model=self.model, data=data) 71 | self.target_col = target_col 72 | 73 | def scatter_matrix_plot(self) -> None: 74 | return px.scatter_matrix( 75 | self.embedded_data_df, color=self.target_col, labels={"color": "target"} 76 | ) 77 | 78 | def scatter_2d_plot(self) -> None: 79 | return px.scatter( 80 | self.embedded_data_df, 81 | x="D1", 82 | y="D2", 83 | color=self.target_col, 84 | labels={"color": "target"}, 85 | ) 86 | 87 | def scatter_3d_plot(self) -> None: 88 | return px.scatter_3d( 89 | self.embedded_data_df, 90 | x="D1", 91 | y="D2", 92 | z="D3", 93 | color=self.target_col, 94 | labels={"color": "target"}, 95 | ) 96 | 
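# Illustrative wiring, mirroring pages/dimensionality_reduction/umap.py shown above.
# The sliders built in `params` are Streamlit widgets, so this only runs inside a
# Streamlit page, not as a standalone script:
#
#     umap_manager = UMAPManager(max_n_components=3)
#     umap_manager.set_model()                # renders the sliders, builds the UMAP model
#     umap_manager.fit(data=X, target_col=y)  # embedding is cached via st.cache_resource
#     st.plotly_chart(umap_manager.scatter_matrix_plot(), use_container_width=True)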
-------------------------------------------------------------------------------- /src/generative_ai/large_language_models/chatbots/chatbot.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from functools import cached_property 3 | 4 | from langchain.callbacks import StreamingStdOutCallbackHandler 5 | from langchain.callbacks.base import BaseCallbackHandler 6 | from langchain.chains import LLMChain 7 | from langchain.chains.base import Chain 8 | from langchain.chat_models import ChatOpenAI 9 | from langchain.llms import Together 10 | from langchain.memory import ConversationBufferMemory 11 | from langchain.prompts import PromptTemplate 12 | 13 | from src.generative_ai.large_language_models.callbacks import \ 14 | StreamingChatCallbackHandler 15 | 16 | 17 | class ModelArgs(t.TypedDict): 18 | provider: t.Literal["openai", "together"] 19 | owner: t.Literal["mistralai", "togethercomputer"] | None 20 | string: t.Literal["gpt-3.5-turbo", "llama-2-7b-chat", "Mistral-7B-Instruct-v0.1"] 21 | 22 | 23 | class Chatbot: 24 | BASE_TEMPLATE = """ 25 | Use the following context and chat history to answer the question: 26 | 27 | Context: {context} 28 | Chat history: {chat_history} 29 | Question: {question} 30 | 31 | Your answer (in {language}): 32 | """ 33 | 34 | def __init__(self, **model_kwargs: t.Unpack[ModelArgs]) -> None: 35 | self.model_provider = model_kwargs.get("provider", "openai") 36 | self.model_owner = model_kwargs.get("owner", None) 37 | self.model_string = model_kwargs.get("string", "gpt-3.5-turbo") 38 | 39 | @cached_property 40 | def llm(self) -> ChatOpenAI | Together: 41 | if self.model_provider == "openai": 42 | return ChatOpenAI( 43 | model=self.model_string, 44 | streaming=True, 45 | model_kwargs={}, 46 | ) 47 | elif self.model_provider == "together": 48 | return Together( 49 | model=f"{self.model_owner}/{self.model_string}", 50 | max_tokens=1024, 51 | ) 52 | 53 | @cached_property 54 | def memory(self) -> ConversationBufferMemory: 55 | return ConversationBufferMemory( 56 | memory_key="chat_history", 57 | input_key="question", 58 | return_messages=True, 59 | ) 60 | 61 | @property 62 | def history(self) -> t.List[t.Dict[str, str]]: 63 | return [ 64 | {"role": message.type, "content": message.content} 65 | for message in self.memory.buffer 66 | ] 67 | 68 | @cached_property 69 | def template(self) -> PromptTemplate: 70 | return PromptTemplate( 71 | template=self.BASE_TEMPLATE, 72 | input_variables=["context", "chat_history", "question", "language"], 73 | ) 74 | 75 | @cached_property 76 | def chain(self) -> Chain: 77 | return LLMChain( 78 | llm=self.llm, 79 | memory=self.memory, 80 | verbose=True, 81 | prompt=self.template, 82 | ) 83 | 84 | @property 85 | def callbacks(self) -> t.List[BaseCallbackHandler]: 86 | return [StreamingChatCallbackHandler(), StreamingStdOutCallbackHandler()] 87 | 88 | def ask( 89 | self, 90 | query: str, 91 | context: str | None = None, 92 | language: str | None = None, 93 | ) -> str: 94 | return self.chain.run( 95 | question=query, 96 | context=context or "", 97 | language=language or "the input language", 98 | callbacks=self.callbacks, 99 | ) 100 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/tsne_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import plotly.express as px 5 | import streamlit as st 6 | from sklearn.manifold import 
TSNE 7 | 8 | 9 | class TSNEManager: 10 | def __init__(self, max_n_components: int): 11 | self.max_n_components = max_n_components 12 | self.model: TSNE | None = None 13 | self.target_col: pd.Series | None = None 14 | self.embedded_data_df: pd.DataFrame | None = None 15 | 16 | @property 17 | def params(self) -> t.Dict[str, int]: 18 | columns = st.columns(3) 19 | return { 20 | "n_components": columns[0].slider( 21 | label="Number of Components", 22 | min_value=1, 23 | max_value=self.max_n_components, 24 | value=3, 25 | step=1, 26 | help="Number of components to compute.", 27 | ), 28 | "perplexity": columns[1].slider( 29 | label="Perplexity", 30 | min_value=1, 31 | max_value=100, 32 | value=30, 33 | step=1, 34 | help="A measure of how to balance attention between local and global aspects of the data.", 35 | ), 36 | "learning_rate": columns[2].slider( 37 | label="Learning Rate", 38 | min_value=10.0, 39 | max_value=500.0, 40 | value=200.0, 41 | step=50.0, 42 | help="Step size for each iteration in optimizing the cost function.", 43 | ), 44 | } 45 | 46 | @st.cache_resource(show_spinner=True) 47 | def _get_model(_self, params: t.Dict[str, int]) -> TSNE: 48 | return TSNE( 49 | n_components=params["n_components"], 50 | perplexity=params["perplexity"], 51 | learning_rate=params["learning_rate"], 52 | ) 53 | 54 | def set_model(self) -> None: 55 | params = self.params 56 | self.model = self._get_model(params) 57 | 58 | @st.cache_resource( 59 | show_spinner=True, 60 | hash_funcs={ 61 | TSNE: lambda model: ( 62 | model.n_components, 63 | model.perplexity, 64 | model.learning_rate, 65 | ) 66 | }, 67 | ) 68 | def _compute_tsne(_self, model: TSNE, data: pd.DataFrame) -> pd.DataFrame: 69 | embedded_data = model.fit_transform(data) 70 | column_names = [f"D{i}" for i in range(1, model.n_components + 1)] 71 | return pd.DataFrame(embedded_data, columns=column_names) 72 | 73 | def fit(self, data: pd.DataFrame, target_col: pd.Series): 74 | self.embedded_data_df = self._compute_tsne(model=self.model, data=data) 75 | self.target_col = target_col 76 | 77 | def scatter_matrix_plot(self) -> None: 78 | return px.scatter_matrix( 79 | self.embedded_data_df, color=self.target_col, labels={"color": "target"} 80 | ) 81 | 82 | def scatter_2d_plot(self) -> None: 83 | return px.scatter( 84 | self.embedded_data_df, 85 | x="D1", 86 | y="D2", 87 | color=self.target_col, 88 | labels={"color": "target"}, 89 | ) 90 | 91 | def scatter_3d_plot(self) -> None: 92 | return px.scatter_3d( 93 | self.embedded_data_df, 94 | x="D1", 95 | y="D2", 96 | z="D3", 97 | color=self.target_col, 98 | labels={"color": "target"}, 99 | ) 100 | -------------------------------------------------------------------------------- /src/computer_vision/landmarks/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from datetime import datetime 4 | 5 | import cv2 6 | import mediapipe as mp 7 | import streamlit_webrtc as st_webrtc 8 | from av import VideoFrame 9 | from mediapipe.framework.formats import landmark_pb2 10 | from numpy import ndarray 11 | 12 | import utils 13 | 14 | logger = utils.CustomLogger(__file__) 15 | 16 | os.environ["MEDIAPIPE_DISABLE_GPU"] = "1" 17 | 18 | 19 | class BaseLandmarkerApp: 20 | def __init__(self): 21 | pass 22 | 23 | def get_landmarks(self, image: ndarray) -> landmark_pb2.NormalizedLandmarkList: 24 | detection_result = self.landmarker.process(image) 25 | landmark_list = getattr(detection_result, self.landmarks_type) 26 | return landmark_list[0] if 
isinstance(landmark_list, list) else landmark_list 27 | 28 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame: 29 | image = frame.to_ndarray(format="bgr24") 30 | 31 | landmark_list = self.get_landmarks(image) 32 | self.annotate_landmarks( 33 | image=image, 34 | connections_list=self.connections_list, 35 | landmark_list=landmark_list, 36 | drawing_specs_list=self.drawing_specs_list, 37 | ) 38 | utils.annotate_time(image=image) 39 | return VideoFrame.from_ndarray(image, format="bgr24") 40 | 41 | def stream(self) -> None: 42 | st_webrtc.webrtc_streamer( 43 | video_frame_callback=self.video_frame_callback, 44 | key=f"{self.landmarks_type}_streamer", 45 | mode=st_webrtc.WebRtcMode.SENDRECV, 46 | rtc_configuration=st_webrtc.RTCConfiguration( 47 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"} 48 | ), 49 | media_stream_constraints={"video": True, "audio": False}, 50 | async_processing=True, 51 | desired_playing_state=None, 52 | ) 53 | 54 | @staticmethod 55 | def annotate_time(image: ndarray) -> None: 56 | text = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 57 | text_args = { 58 | "text": text, 59 | "fontFace": cv2.FONT_HERSHEY_SIMPLEX, 60 | "fontScale": 1, 61 | "thickness": 2, 62 | } 63 | text_size = cv2.getTextSize(**text_args)[0] 64 | rect_width, rect_height = text_size[0] + 20, text_size[1] + 20 65 | cv2.rectangle( 66 | img=image, 67 | pt1=(0, 0), 68 | pt2=(rect_width, rect_height), 69 | color=(255, 255, 255), 70 | thickness=cv2.FILLED, 71 | ) 72 | cv2.rectangle( 73 | img=image, 74 | pt1=(0, 0), 75 | pt2=(rect_width, rect_height), 76 | color=(0, 0, 0), 77 | thickness=2, 78 | ) 79 | cv2.putText( 80 | img=image, 81 | org=(10, text_size[1] + 10), 82 | color=(0, 0, 0), 83 | lineType=cv2.LINE_AA, 84 | **text_args, 85 | ) 86 | 87 | @staticmethod 88 | def annotate_landmarks( 89 | image: ndarray, 90 | connections_list: t.List[t.FrozenSet[t.Tuple[int, int]]], 91 | landmark_list: landmark_pb2.NormalizedLandmarkList, 92 | drawing_specs_list: t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]], 93 | ) -> None: 94 | if not landmark_list: 95 | return 96 | 97 | for connections, drawing_specs in zip(connections_list, drawing_specs_list): 98 | mp.solutions.drawing_utils.draw_landmarks( 99 | image=image, 100 | landmark_list=landmark_list, 101 | connections=connections, 102 | **drawing_specs, 103 | ) 104 | -------------------------------------------------------------------------------- /src/machine_learning/datasets.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | from sklearn import datasets 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | class DatasetParams(t.TypedDict): 10 | source: t.Literal["iris", "digits", "breast_cancer"] 11 | test_size: float | None 12 | shuffle: bool 13 | stratify: bool 14 | 15 | 16 | class Dataset: 17 | def __init__( 18 | self, 19 | type: t.Literal["classification", "regression"] | None = None, 20 | ): 21 | self.type = type 22 | self.X: t.Tuple[pd.DataFrame, pd.DataFrame] | None = None 23 | self.y: t.Tuple[pd.Series, pd.Series] | None = None 24 | self.label_mapping: t.Dict[int, str] | None = None 25 | self.description: str | None = None 26 | 27 | @property 28 | def params(self) -> t.Dict[str, t.Any]: 29 | columns = st.columns(3) 30 | return { 31 | "source": columns[0].selectbox( 32 | label="source", 33 | options=["iris", "digits", "breast_cancer"] 34 | if self.type == "classification" 35 | else 
["diabetes"] 36 | if self.type == "regression" 37 | else ["iris", "digits", "breast_cancer", "diabetes"], 38 | help="The scikit-learn toy dataset to use.", 39 | ), 40 | "test_size": columns[1].slider( 41 | "test_size", 42 | min_value=0.05, 43 | max_value=0.3, 44 | value=0.2, 45 | step=0.05, 46 | help="The proportion of the dataset to include in the test split", 47 | ) 48 | if self.type is not None 49 | else None, 50 | "shuffle": columns[2].checkbox( 51 | label="shuffle", 52 | value=True, 53 | help="Whether to shuffle the dataset or not.", 54 | ) 55 | if self.type is not None 56 | else None, 57 | "stratify": columns[2].checkbox( 58 | label="stratify", 59 | value=False, 60 | help="Whether to stratify the dataset or not. " 61 | "Stratifying means keeping the same label distribution in the initial, train and test datasets. " 62 | "Available for classification only.", 63 | disabled=self.type == "regression", 64 | ) 65 | if self.type is not None 66 | else None, 67 | } 68 | 69 | @staticmethod 70 | @st.cache_data(show_spinner=False) 71 | def get_dataset( 72 | split: bool = False, **params: t.Unpack[DatasetParams] 73 | ) -> t.Dict[str, t.Any]: 74 | raw_dataset = getattr(datasets, f"load_{params['source']}")(as_frame=True) 75 | X, y = raw_dataset.data, raw_dataset.target 76 | if split: 77 | X_train, X_test, y_train, y_test = train_test_split( 78 | X, 79 | y, 80 | test_size=params["test_size"], 81 | shuffle=params["shuffle"], 82 | stratify=y if params["stratify"] else None, 83 | random_state=0, 84 | ) 85 | X = X_train, X_test 86 | y = y_train, y_test 87 | return { 88 | "X": X, 89 | "y": y, 90 | "label_mapping": dict(enumerate(raw_dataset.target_names)) 91 | if "target_names" in raw_dataset 92 | else None, 93 | "description": raw_dataset.DESCR, 94 | } 95 | 96 | def set(self, raw_dataset_dict: t.Dict[str, t.Any]): 97 | self.X = raw_dataset_dict["X"] 98 | self.y = raw_dataset_dict["y"] 99 | self.label_mapping = raw_dataset_dict["label_mapping"] 100 | self.description = raw_dataset_dict["description"] 101 | -------------------------------------------------------------------------------- /src/statistics/statistical_tests/ab_test.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import streamlit as st 4 | from scipy import stats 5 | 6 | 7 | def input_group_data( 8 | group_name: t.Literal["A", "B"], 9 | default_visitors: int, 10 | default_conversions: int, 11 | ) -> t.Tuple[int, int, float]: 12 | visitors = st.number_input( 13 | "Visitors", 14 | key=f"ab_test.{group_name.lower()}_visitors", 15 | min_value=1, 16 | value=default_visitors, 17 | step=1, 18 | ) 19 | conversion_col, rate_col = st.columns(2) 20 | conversions = conversion_col.number_input( 21 | "Conversions", 22 | key=f"ab_test.{group_name.lower()}_conversions", 23 | min_value=0, 24 | max_value=visitors, 25 | value=default_conversions, 26 | step=1, 27 | ) 28 | rate = rate_col.number_input( 29 | "Conversion rate", 30 | key=f"ab_test.{group_name.lower()}_rate", 31 | min_value=0.0, 32 | max_value=1.0, 33 | value=conversions / visitors, 34 | disabled=True, 35 | ) 36 | return visitors, conversions, rate 37 | 38 | 39 | class ABTesting: 40 | def __init__( 41 | self, 42 | a_visitors: int, 43 | a_rate: float, 44 | b_visitors: int, 45 | b_rate: float, 46 | alpha: float, 47 | test_type: t.Literal["one-sided", "two-sided"], 48 | ): 49 | self.a_visitors, self.a_rate = a_visitors, a_rate 50 | self.b_visitors, self.b_rate = b_visitors, b_rate 51 | self.alpha = alpha 52 | self.test_type = test_type 
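    # The helpers below use the normal approximation for a conversion rate p
    # observed over n visitors: SE(p) = sqrt(p * (1 - p) / n), which is what
    # compute_standard_deviation() returns.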
53 | 54 | @staticmethod 55 | @st.cache_data(show_spinner=False) 56 | def compute_standard_deviation(rate: float, visitors: int) -> float: 57 | return (rate * (1 - rate) / visitors) ** 0.5 58 | 59 | @classmethod 60 | @st.cache_data(show_spinner=False) 61 | def compute_confidence_interval( 62 | _cls, 63 | a_rate: float, 64 | b_rate: float, 65 | a_visitors: int, 66 | b_visitors: int, 67 | alpha: float, 68 | ) -> t.Tuple[float, float]: 69 | a_std = _cls.compute_standard_deviation(a_rate, a_visitors) 70 | b_std = _cls.compute_standard_deviation(b_rate, b_visitors) 71 | interval = ( 72 | stats.norm.ppf(1 - alpha / 2) 73 | * ((a_std**2 / a_visitors) + (b_std**2 / b_visitors)) ** 0.5 74 | ) 75 | return b_rate - a_rate - interval, b_rate - a_rate + interval 76 | 77 | @staticmethod 78 | @st.cache_data(show_spinner=False) 79 | def is_statistically_significant(p_value: float, alpha: float) -> bool: 80 | return p_value < alpha 81 | 82 | @staticmethod 83 | @st.cache_data(show_spinner=False) 84 | def t_test(a_rate, a_std, a_visitors, b_rate, b_std, b_visitors, test_type): 85 | t_statistic, p_value = stats.ttest_ind_from_stats( 86 | mean1=a_rate, 87 | std1=a_std, 88 | nobs1=a_visitors, 89 | mean2=b_rate, 90 | std2=b_std, 91 | nobs2=b_visitors, 92 | ) 93 | if test_type == "one-sided": 94 | p_value /= 2 95 | 96 | return t_statistic, p_value 97 | 98 | def perform_ab_test(self) -> t.Dict[str, any]: 99 | a_std = self.compute_standard_deviation(self.a_rate, self.a_visitors) 100 | b_std = self.compute_standard_deviation(self.b_rate, self.b_visitors) 101 | 102 | t_statistic, p_value = self.t_test( 103 | self.a_rate, 104 | a_std, 105 | self.a_visitors, 106 | self.b_rate, 107 | b_std, 108 | self.b_visitors, 109 | self.test_type, 110 | ) 111 | 112 | confidence_interval = self.compute_confidence_interval( 113 | self.a_rate, 114 | self.b_rate, 115 | self.a_visitors, 116 | self.b_visitors, 117 | self.alpha, 118 | ) 119 | 120 | is_significant = self.is_statistically_significant(p_value, self.alpha) 121 | 122 | return { 123 | "t_statistic": t_statistic, 124 | "p_value": p_value, 125 | "confidence_interval": confidence_interval, 126 | "is_significant": is_significant, 127 | } 128 | -------------------------------------------------------------------------------- /src/statistics/dimensionality_reduction/pca_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | import streamlit as st 8 | from sklearn.decomposition import PCA 9 | 10 | 11 | class PCAManager: 12 | def __init__(self, max_n_components: int): 13 | self.max_n_components = max_n_components 14 | self.normalize: bool | None = None 15 | self.model: PCA | None = None 16 | self.target_col: pd.Series | None = None 17 | 18 | @property 19 | def params(self) -> t.Dict[str, int]: 20 | columns = st.columns(2) 21 | return { 22 | "n_components": columns[0].slider( 23 | label="Number of Components", 24 | min_value=1, 25 | max_value=self.max_n_components, 26 | value=3, 27 | step=1, 28 | help="Number of principal components to compute.", 29 | ), 30 | "normalize": ( 31 | columns[1] 32 | .columns([0.5, 1, 0.5])[1] 33 | .toggle("Normalize data", value=False) 34 | ), 35 | } 36 | 37 | @st.cache_resource(show_spinner=True) 38 | def _get_model(_self, n_components: int) -> PCA: 39 | return PCA(n_components) 40 | 41 | def set_model(self) -> None: 42 | params = self.params 43 | self.model = 
self._get_model(params["n_components"]) 44 | self.model.normalize = params["normalize"] 45 | 46 | @st.cache_resource( 47 | show_spinner=True, 48 | hash_funcs={PCA: lambda model: (model.n_components, model.normalize)}, 49 | ) 50 | def _compute_pca( 51 | _self, model: PCA, data: pd.DataFrame 52 | ) -> t.Tuple[pd.DataFrame, PCA]: 53 | data_normalized = ( 54 | (data - data.mean()) / (data.std() + 1e-5) if model.normalize else data 55 | ) 56 | components = model.fit_transform(data_normalized) 57 | 58 | return pd.DataFrame( 59 | components, columns=[f"PC{i+1}" for i in range(components.shape[1])] 60 | ) 61 | 62 | def fit(self, data: pd.DataFrame, target_col: pd.Series): 63 | self.components_df = self._compute_pca(model=self.model, data=data) 64 | self.target_col = target_col 65 | 66 | def scatter_matrix_plot(self) -> None: 67 | return px.scatter_matrix( 68 | self.components_df, color=self.target_col, labels={"color": "target"} 69 | ) 70 | 71 | def explained_variance_plot(self) -> None: 72 | exp_var_cumul = np.cumsum(self.model.explained_variance_ratio_) 73 | x_ticks = list(range(1, exp_var_cumul.shape[0] + 1)) 74 | fig = px.bar( 75 | x=x_ticks, 76 | y=exp_var_cumul, 77 | labels={"x": "# Components", "y": "Explained Variance"}, 78 | ) 79 | fig.update_xaxes(tickvals=x_ticks, ticktext=list(map(str, x_ticks))) 80 | fig.add_trace( 81 | go.Scatter( 82 | x=x_ticks, 83 | y=exp_var_cumul, 84 | mode="lines+markers", 85 | line=dict(color="red", width=3), 86 | marker=dict(size=10), 87 | showlegend=False, 88 | ) 89 | ) 90 | return fig 91 | 92 | def scatter_2d_plot(self) -> None: 93 | return px.scatter( 94 | self.components_df, 95 | x="PC1", 96 | y="PC2", 97 | color=self.target_col, 98 | labels={"color": "target"}, 99 | ) 100 | 101 | def scatter_3d_plot(self) -> None: 102 | return px.scatter_3d( 103 | self.components_df, 104 | x="PC1", 105 | y="PC2", 106 | z="PC3", 107 | color=self.target_col, 108 | labels={"color": "target"}, 109 | ) 110 | 111 | def loadings_plot(self) -> None: 112 | loadings = self.model.components_.T * np.sqrt(self.model.explained_variance_) 113 | 114 | fig = px.scatter( 115 | self.components_df, 116 | x="PC1", 117 | y="PC2", 118 | color=self.target_col, 119 | labels={"color": "target"}, 120 | ) 121 | 122 | for i, feature in enumerate(self.components_df.columns): 123 | fig.add_annotation( 124 | ax=0, 125 | ay=0, 126 | axref="x", 127 | ayref="y", 128 | x=loadings[i, 0], 129 | y=loadings[i, 1], 130 | showarrow=True, 131 | arrowsize=2, 132 | arrowhead=2, 133 | xanchor="right", 134 | yanchor="top", 135 | ) 136 | fig.add_annotation( 137 | x=loadings[i, 0], 138 | y=loadings[i, 1], 139 | ax=0, 140 | ay=0, 141 | xanchor="center", 142 | yanchor="bottom", 143 | text=feature, 144 | yshift=5, 145 | ) 146 | return fig 147 | -------------------------------------------------------------------------------- /pages/pages_config.yaml: -------------------------------------------------------------------------------- 1 | __init__.py: 2 | NAME: Home 3 | ICON: 🏠 4 | DESCRIPTION: | 5 | Welcome to my superapp, a comprehensive toolset for data science and machine learning 🚀 6 | [![source code](https://img.shields.io/badge/source_code-gray?logo=github)](https://github.com/daltunay/my-superapp/) 7 | 8 | Technologies used: 9 | - **Programming language**: Python 10 | - **Libraries**: pandas, numpy, scikit-learn, OpenCV, Mediapipe, plotly, XGBoost, SHAP, LangChain, OpenAI, Together, FAISS, ultralytics, umap 11 | - **Deployment**: Docker, Streamlit 12 | 13 | Feel free to provide feedback and make this superapp even more 

    > _Made by Daniel Altunay_
    [![LinkedIn URL](https://img.icons8.com/?id=13930&format=png)](https://linkedin.com/in/daltunay)
    [![GitHub URL](https://img.icons8.com/?id=AZOZNnY73haj&format=png)](https://github.com/daltunay)

    ---
  SIDEBAR: radio
  TAG:

statistical_tests:
  __init__.py:
    NAME: Statistical Tests
    ICON: 🔢
    DESCRIPTION: Perform several statistical tests!
    SIDEBAR: radio
    TAG: 📚 Statistics

  ab_test.py:
    NAME: A/B Test
    ICON: 🆎
    DESCRIPTION: |
      Perform A/B tests!
      > pandas, numpy, scipy
    TAG:

  chi2_test.py:
    NAME: Chi-squared Test
    ICON: 🆇
    DESCRIPTION: |
      Perform chi-squared tests!
      > pandas, numpy, scipy
    TAG:

dimensionality_reduction:
  __init__.py:
    NAME: Dimensionality Reduction
    ICON: 🔽
    DESCRIPTION: Reduce dimensionality for high-D data!
    SIDEBAR: radio
    TAG: 📚 Statistics

  pca.py:
    NAME: PCA
    ICON: ⭕
    DESCRIPTION: |
      Perform Principal Component Analysis!
      > pandas, scikit-learn, plotly
    TAG:

  t-sne.py:
    NAME: t-SNE
    ICON: 📊
    DESCRIPTION: |
      Perform t-distributed Stochastic Neighbor Embedding!
      > pandas, scikit-learn, plotly
    TAG:

  umap.py:
    NAME: UMAP
    ICON: 🗺️
    DESCRIPTION: |
      Perform Uniform Manifold Approximation and Projection!
      > pandas, scikit-learn, umap, plotly
    TAG:

landmarks:
  __init__.py:
    NAME: Landmarks Detection
    ICON: 📍
    DESCRIPTION: Perform live landmark detection using your webcam!
    SIDEBAR: radio
    TAG: 👁️ Computer Vision

  face_landmarks.py:
    NAME: Face Mesh
    ICON: 👤
    DESCRIPTION: |
      Detect face landmarks using Mediapipe!
      > OpenCV, Mediapipe, WebRTC
    TAG:

  pose_landmarks.py:
    NAME: Pose Landmarks
    ICON: 🤸‍♂️
    DESCRIPTION: |
      Detect body pose landmarks using Mediapipe!
      > OpenCV, Mediapipe, WebRTC
    TAG:

object_detection:
  __init__.py:
    NAME: Object Detection
    ICON: 🔍
    DESCRIPTION: Perform live object detection using your webcam!
    SIDEBAR: radio
    TAG: 👁️ Computer Vision

  face_detection.py:
    NAME: Face Detection
    ICON: 👀
    DESCRIPTION: |
      Detect one or several faces using Mediapipe!
      > OpenCV, Mediapipe, WebRTC
    TAG:

  multi_objects.py:
    NAME: Multi-Object Detection
    ICON: 📦
    DESCRIPTION: |
      Detect 80 unique labels using YOLOv8!
      > OpenCV, ultralytics, WebRTC
    TAG:

image_generation:
  __init__.py:
    NAME: Image Generation
    ICON: 🎨
    DESCRIPTION: Generate pictures with AI!
    SIDEBAR: radio
    TAG: 🧠 Generative AI

  dall_e.py:
    NAME: DALL·E
    ICON: 🖼️
    DESCRIPTION: DALL·E model from OpenAI
    TAG:

  stable_diffusion.py:
    NAME: Stable Diffusion
    ICON: 🖼️
    DESCRIPTION: Stable Diffusion model from Stability AI
    TAG:

large_language_models:
  __init__.py:
    NAME: Large Language Models
    ICON: 💬
    DESCRIPTION: Interact with large language models!
    SIDEBAR: radio
    TAG: 🧠 Generative AI

  chatbot.py:
    NAME: Basic Chatbot
    ICON: 👋
    DESCRIPTION: |
      A regular chatbot.
      > LangChain, OpenAI, Together
    TAG: 🤖 Chatbots

  chatbot_rag.py:
    NAME: Chatbot with RAG
    ICON: 📄
    DESCRIPTION: |
      A chatbot with RAG (retrieval augmented generation).
      > LangChain, OpenAI, Together, FAISS
    TAG: 🤖 Chatbots

  chatbot_tools.py:
    NAME: Chatbot with Tools
    ICON: 🛠️
    DESCRIPTION: |
      A chatbot augmented with tools (web access, code interpreter, etc.).
      > LangChain (Agents), OpenAI, Together
    TAG: 🤖 Chatbots

  chatbot_web_summary.py:
    NAME: Webpage Summary
    ICON: 🌐
    DESCRIPTION: |
      A model to summarize the text content of a webpage.
      > LangChain, OpenAI, Together, unstructured
    TAG: 🔄 Other

classification:
  __init__.py:
    NAME: Classification
    ICON: 🎯
    DESCRIPTION: Perform several types of classification!
    SIDEBAR: radio
    TAG: ⚙️ Machine Learning

  xgboost.py:
    NAME: Gradient Boosting
    ICON: 🌲
    DESCRIPTION: |
      Use gradient boosting for binary & multi-class classification!
      > pandas, XGBoost, scikit-learn, SHAP, plotly
    TAG:

regression:
  __init__.py:
    NAME: Regression
    ICON: 📈
    DESCRIPTION: Perform several types of regression!
    SIDEBAR: radio
    TAG: ⚙️ Machine Learning

  xgboost.py:
    NAME: Gradient Boosting
    ICON: 🌲
    DESCRIPTION: |
      Use gradient boosting for regression!
      > pandas, XGBoost, scikit-learn, SHAP, plotly
    TAG:

clustering:
  __init__.py:
    NAME: Clustering
    ICON: 🕸️
    DESCRIPTION: Perform different types of clustering!
    SIDEBAR: radio
    TAG: ⚙️ Machine Learning

  kmeans.py:
    NAME: K-Means
    ICON: 🇰
    DESCRIPTION: |
      Perform a K-Means clustering!
      > pandas, scikit-learn, plotly
    TAG:

  dbscan.py:
    NAME: DBSCAN
    ICON: 🇩
    DESCRIPTION: |
      Perform a DBSCAN clustering!
      > pandas, scikit-learn, plotly
    TAG:
--------------------------------------------------------------------------------
/src/machine_learning/xgboost_manager.py:
--------------------------------------------------------------------------------
import typing as t

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import sklearn.metrics
import streamlit as st
from matplotlib.figure import Figure
from xgboost import XGBClassifier, XGBRegressor


def xgb_hash_func(model: XGBClassifier | XGBRegressor):
    return {key: val for key, val in vars(model).items() if key != "_Booster"}


class XGBoostManager:
    def __init__(self, task: t.Literal["classification", "regression"]) -> None:
        self.task = task
        self.model: XGBClassifier | XGBRegressor | None = None
        self.classification_report: pd.DataFrame | None = None
        self.confusion_matrix: pd.DataFrame | None = None
        self.metrics_report: pd.DataFrame | None = None

    @property
    def params(self) -> t.Dict[str, float | int]:
        columns = st.columns(3)
        return {
            "max_depth": columns[0].slider(
                label="`max_depth`",
                min_value=1,
                max_value=5,
                value=3,
                step=1,
                help="Maximum depth of a tree. "
                "Increasing this value will make the model more complex and more likely to overfit. "
                "0 indicates no limit on depth.",
            ),
            "learning_rate": columns[0].select_slider(
                label="`learning_rate`",
                options=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
                value=0.01,
                help="Step size shrinkage used in update to prevent overfitting. "
                "After each boosting step, we can directly get the weights of new features, and `learning_rate` shrinks the feature weights to make the boosting process more conservative.",
            ),
            "n_estimators": columns[0].slider(
                label="`n_estimators`",
                min_value=10,
                max_value=50,
                value=50,
                step=10,
                help="Number of gradient boosted trees. "
                "Equivalent to number of boosting rounds.",
            ),
            "subsample": columns[1].slider(
                label="`subsample`",
                min_value=0.1,
                max_value=1.0,
                value=0.8,
                step=0.1,
                help="Subsample ratio of the training instances. "
                "Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees, and this will prevent overfitting. "
                "Subsampling will occur once in every boosting iteration.",
            ),
            "colsample_bytree": columns[1].slider(
                label="`colsample_bytree`",
                min_value=0.1,
                max_value=1.0,
                value=0.8,
                step=0.1,
                help="Subsample ratio of columns when constructing each tree. "
                "Subsampling occurs once for every tree constructed.",
            ),
            "min_split_loss": columns[1].slider(
                label="`min_split_loss`",
                min_value=0.0,
                max_value=5.0,
                value=0.0,
                step=0.5,
                help="Minimum loss reduction required to make a further partition on a leaf node of the tree. "
                "The larger `min_split_loss` is, the more conservative the algorithm will be.",
            ),
            "min_child_weight": columns[2].slider(
                label="`min_child_weight`",
                min_value=0.0,
                max_value=5.0,
                value=1.0,
                step=0.5,
                help="Minimum sum of instance weight (hessian) needed in a child. "
                "If the tree partition step results in a leaf node with the sum of instance weight less than `min_child_weight`, then the building process will give up further partitioning. "
                "In a linear regression task, this simply corresponds to the minimum number of instances needed in each node. "
                "The larger `min_child_weight` is, the more conservative the algorithm will be.",
            ),
            "reg_alpha": columns[2].slider(
                label="`reg_alpha`",
                min_value=0.0,
                max_value=5.0,
                value=1.0,
                step=0.5,
                help="L1 regularization term on weights. "
                "Increasing this value will make the model more conservative.",
            ),
            "reg_lambda": columns[2].slider(
                label="`reg_lambda`",
                min_value=0.0,
                max_value=5.0,
                value=0.0,
                step=0.5,
                help="L2 regularization term on weights. "
                "Increasing this value will make the model more conservative.",
            ),
        }

    @staticmethod
    @st.cache_resource(show_spinner=True)
    def _get_model(
        task: t.Literal["classification", "regression"],
        label_mapping: t.Dict[int, str] | None = None,
        **params,
    ) -> XGBClassifier | XGBRegressor:
        if task == "classification":
            return XGBClassifier(**params)
        elif task == "regression":
            return XGBRegressor(**params)

    def set_model(self, label_mapping: t.Dict[int, str] | None = None) -> None:
        self.model = self._get_model(self.task, label_mapping, **self.params)

    @staticmethod
    @st.cache_resource(
        show_spinner=True,
        hash_funcs={XGBClassifier: xgb_hash_func, XGBRegressor: xgb_hash_func},
    )
    def _fit_model(
        model: XGBClassifier | XGBRegressor, X_train: pd.DataFrame, y_train: pd.Series
    ) -> XGBClassifier | XGBRegressor:
        return model.fit(X_train, y_train)

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        self.model = self._fit_model(self.model, X_train, y_train)

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _classification_report(
        y_true: pd.Series, y_pred: pd.Series, target_names: t.List[str]
    ):
        return (
            pd.DataFrame(
                sklearn.metrics.classification_report(
                    y_true=y_true,
                    y_pred=y_pred,
                    target_names=target_names,
                    output_dict=True,
                    zero_division=np.nan,
                )
            )
            .astype(float)
            .round(4)
            .transpose()
        )

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _confusion_matrix(y_true: pd.Series, y_pred: pd.Series):
        return pd.DataFrame(
            sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=y_pred)
        )

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _metrics_report(y_true: pd.Series, y_pred: pd.Series):
        mean_absolute_error = sklearn.metrics.mean_absolute_error(y_true, y_pred)
        median_absolute_error = sklearn.metrics.median_absolute_error(y_true, y_pred)
        mean_squared_error = sklearn.metrics.mean_squared_error(y_true, y_pred)
        r2 = sklearn.metrics.r2_score(y_true, y_pred)
        explained_variance = sklearn.metrics.explained_variance_score(y_true, y_pred)
        return pd.DataFrame(
            {
                "Mean Absolute Error": [mean_absolute_error],
                "Median Absolute Error": [median_absolute_error],
                "Mean Squared Error": [mean_squared_error],
                "Root Mean Squared Error": [mean_squared_error**0.5],
                "R^2": [r2],
                "Explained Variance": [explained_variance],
            },
            index=["Value"],
        ).transpose()

    @staticmethod
    @st.cache_data(show_spinner=True)
    def _confusion_matrix_display(
        confusion_matrix: np.ndarray, display_labels: t.List[str]
    ) -> sklearn.metrics.ConfusionMatrixDisplay:
        return sklearn.metrics.ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix,
            display_labels=display_labels,
        )

    def confusion_matrix_display(self, display_labels: t.List[str]) -> Figure:
        confusion_matrix_display = self._confusion_matrix_display(
            confusion_matrix=self.confusion_matrix.to_numpy(),
            display_labels=display_labels,
        )
        fig, ax = plt.subplots()
        confusion_matrix_display.plot(ax=ax)
        return fig

    def evaluate(
        self,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        target_names: t.List[str] | None = None,
    ):
        y_pred = self.model.predict(X_test)
        if self.task == "classification":
            self.classification_report = self._classification_report(
                y_true=y_test,
                y_pred=y_pred,
                target_names=target_names,
            )
            self.confusion_matrix = self._confusion_matrix(
                y_true=y_test,
                y_pred=y_pred,
            )
        elif self.task == "regression":
            self.metrics_report = self._metrics_report(
                y_true=y_test,
                y_pred=y_pred,
            )

    @staticmethod
    @st.cache_data(
        show_spinner=True,
        hash_funcs={XGBClassifier: xgb_hash_func, XGBRegressor: xgb_hash_func},
    )
    def _shap_values(model: XGBClassifier | XGBRegressor, X_test: pd.DataFrame):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        return explainer, shap_values

    def shap_force_plot(self, X_test: pd.DataFrame):
        explainer, shap_values = self._shap_values(self.model, X_test)
        base_value = explainer.expected_value
        if isinstance(self.model, XGBClassifier):
            base_value = base_value[0]
            shap_values = shap_values[0]
        return shap.force_plot(
            base_value=base_value, shap_values=shap_values, features=X_test
        )
--------------------------------------------------------------------------------
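A minimal usage sketch (an editor's illustration, not a file in this repository): it assumes XGBoostManager is importable as src.machine_learning.xgboost_manager.XGBoostManager, that the code runs inside a Streamlit page so the widgets rendered by the params property have somewhere to appear, and it substitutes scikit-learn's breast-cancer toy dataset for the app's own loaders in src/machine_learning/datasets.py.

# Hypothetical Streamlit page (not part of the repo): end-to-end use of XGBoostManager.
import streamlit as st
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from src.machine_learning.xgboost_manager import XGBoostManager

st.title("XGBoost classification (usage sketch)")

# Toy data in place of the app's own dataset loaders.
data = load_breast_cancer(as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

manager = XGBoostManager(task="classification")
manager.set_model()  # renders the hyperparameter sliders and builds the XGBClassifier
manager.fit(X_train, y_train)
manager.evaluate(X_test, y_test, target_names=list(data.target_names))

st.dataframe(manager.classification_report)
st.pyplot(manager.confusion_matrix_display(display_labels=list(data.target_names)))

The same flow should apply to task="regression", with metrics_report taking the place of the classification report and confusion matrix.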