├── src
│   ├── __init__.py
│   ├── computer_vision
│   │   ├── __init__.py
│   │   ├── object_detection
│   │   │   ├── __init__.py
│   │   │   ├── multi_objects.py
│   │   │   └── face_detection.py
│   │   └── landmarks
│   │       ├── __init__.py
│   │       ├── pose_landmarks.py
│   │       ├── face_landmarks.py
│   │       └── base.py
│   ├── machine_learning
│   │   ├── __init__.py
│   │   ├── clustering
│   │   │   ├── __init__.py
│   │   │   ├── dbscan_manager.py
│   │   │   └── kmeans_manager.py
│   │   ├── datasets.py
│   │   └── xgboost_manager.py
│   ├── statistics
│   │   ├── statistical_tests
│   │   │   ├── __init__.py
│   │   │   ├── chi_squared.py
│   │   │   └── ab_test.py
│   │   └── dimensionality_reduction
│   │       ├── __init__.py
│   │       ├── umap_manager.py
│   │       ├── tsne_manager.py
│   │       └── pca_manager.py
│   └── generative_ai
│       ├── image_generation
│       │   ├── __init__.py
│       │   ├── dall_e.py
│       │   └── stable_diffusion.py
│       └── large_language_models
│           ├── __init__.py
│           ├── chatbots
│           │   ├── __init__.py
│           │   ├── chatbot_web_summary.py
│           │   ├── chatbot_rag.py
│           │   ├── chatbot_tools.py
│           │   └── chatbot.py
│           ├── callbacks.py
│           └── ingest.py
├── data
│   └── documents
│       └── .gitkeep
├── faiss_index
│   └── .gitkeep
├── notebooks
│   └── draft.ipynb
├── packages.txt
├── pages
│   ├── __init__.py
│   ├── landmarks
│   │   ├── __init__.py
│   │   ├── face_landmarks.py
│   │   └── pose_landmarks.py
│   ├── classification
│   │   ├── __init__.py
│   │   └── xgboost.py
│   ├── clustering
│   │   ├── __init__.py
│   │   ├── dbscan.py
│   │   └── kmeans.py
│   ├── regression
│   │   ├── __init__.py
│   │   └── xgboost.py
│   ├── image_generation
│   │   ├── __init__.py
│   │   ├── dall_e.py
│   │   └── stable_diffusion.py
│   ├── object_detection
│   │   ├── __init__.py
│   │   ├── face_detection.py
│   │   └── multi_objects.py
│   ├── statistical_tests
│   │   ├── __init__.py
│   │   ├── ab_test.py
│   │   └── chi2_test.py
│   ├── dimensionality_reduction
│   │   ├── __init__.py
│   │   ├── t-sne.py
│   │   ├── umap.py
│   │   └── pca.py
│   ├── large_language_models
│   │   ├── __init__.py
│   │   ├── chatbot_web_summary.py
│   │   ├── chatbot.py
│   │   ├── chatbot_tools.py
│   │   └── chatbot_rag.py
│   └── pages_config.yaml
├── utils
│   ├── widgets
│   │   ├── __init__.py
│   │   ├── language.py
│   │   └── lakera.py
│   ├── callbacks.py
│   ├── shap.py
│   ├── secrets.py
│   ├── turn.py
│   ├── __init__.py
│   ├── image_annotation.py
│   ├── logging.py
│   ├── pages_config.py
│   ├── misc.py
│   └── streamlit_display.py
├── .streamlit
│   ├── config.toml
│   └── secrets.toml.example
├── .gitignore
├── app.py
├── Dockerfile
├── bin
│   └── run.sh
├── config
│   ├── providers.yaml
│   └── models.yaml
├── LICENSE
├── pyproject.toml
└── README.md
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/documents/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/faiss_index/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/notebooks/draft.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages.txt:
--------------------------------------------------------------------------------
1 | python3-opencv
--------------------------------------------------------------------------------
/src/computer_vision/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pages/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/landmarks/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/classification/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/regression/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/image_generation/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/object_detection/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/statistical_tests/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/pages/dimensionality_reduction/__init__.py:
--------------------------------------------------------------------------------
1 | import utils
2 |
3 | loader = utils.PageConfigLoader(__file__)
4 | loader.set_page_config(globals())
5 |
--------------------------------------------------------------------------------
/utils/widgets/__init__.py:
--------------------------------------------------------------------------------
1 | from utils.widgets.lakera import LakeraWidget
2 | from utils.widgets.language import LanguageWidget
3 |
4 | __all__ = ["LakeraWidget", "LanguageWidget"]
5 |
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [browser]
2 | gatherUsageStats = true
3 |
4 | [server]
5 | address = "0.0.0.0"
6 | port = 8501
7 |
8 | [global]
9 | disableWidgetStateDuplicationWarning = true
10 |
--------------------------------------------------------------------------------
/utils/callbacks.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 |
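# Used as an on_change callback for a pair of complementary sliders: when the widget
# behind `updated` changes, the widget behind `to_update` is set to 1 - its value.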
4 | def update_slider_callback(updated: str, to_update: str):
5 | setattr(st.session_state, to_update, 1 - st.session_state.get(updated))
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .DS_Store
3 | .mypy_cache/
4 | .ruff_cache/
5 | .__pycache__/
6 | .streamlit/secrets.toml
7 | *.pem
8 | *.pyc
9 | cache/
10 | TODO.md
11 | *.pt
12 | *.faiss
13 | *.pkl
--------------------------------------------------------------------------------
/src/machine_learning/__init__.py:
--------------------------------------------------------------------------------
1 | from src.machine_learning.datasets import Dataset
2 | from src.machine_learning.xgboost_manager import XGBoostManager
3 |
4 | __all__ = ["Dataset", "XGBoostManager"]
5 |
--------------------------------------------------------------------------------
/src/machine_learning/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | from src.machine_learning.clustering.dbscan_manager import DBScanManager
2 | from src.machine_learning.clustering.kmeans_manager import KMeansManager
3 |
4 | __all__ = ["KMeansManager", "DBScanManager"]
5 |
--------------------------------------------------------------------------------
/utils/shap.py:
--------------------------------------------------------------------------------
1 | import shap
2 | import streamlit.components.v1 as components
3 |
4 |
5 | def st_shap(plot, height=None):
6 | shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>"
7 | components.html(shap_html, height=height)
8 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import streamlit_superapp as st_superapp
3 |
4 | import utils
5 |
6 | utils.load_secrets()
7 |
8 | st.set_page_config(page_title="daltunay", page_icon="🚀", layout="centered")
9 |
10 | st_superapp.run()
11 |
--------------------------------------------------------------------------------
/src/computer_vision/object_detection/__init__.py:
--------------------------------------------------------------------------------
1 | from src.computer_vision.object_detection.face_detection import \
2 | FaceDetectionApp
3 | from src.computer_vision.object_detection.multi_objects import \
4 | MultiObjectsDetectionApp
5 |
6 | __all__ = ["FaceDetectionApp", "MultiObjectsDetectionApp"]
7 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 |
3 | WORKDIR /app
4 |
5 | COPY . /app
6 |
7 | RUN pip install "poetry==1.7.0" \
8 | && poetry config virtualenvs.create false \
9 | && poetry install --no-interaction --without dev
10 |
11 | EXPOSE 8501
12 |
13 | ENTRYPOINT ["streamlit", "run"]
14 |
15 | CMD ["app.py"]
16 |
--------------------------------------------------------------------------------
/src/statistics/statistical_tests/__init__.py:
--------------------------------------------------------------------------------
1 | from src.statistics.statistical_tests.ab_test import (ABTesting,
2 | input_group_data)
3 | from src.statistics.statistical_tests.chi_squared import Chi2Testing
4 |
5 | __all__ = ["ABTesting", "input_group_data", "Chi2Testing"]
6 |
--------------------------------------------------------------------------------
/bin/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export DOCKER_CLI_HINTS=false
4 |
5 | if [ "$(docker ps -q --filter ancestor=daltunay)" ]; then
6 | docker stop $(docker ps -q --filter ancestor=daltunay)
7 | docker rm $(docker ps -q --filter ancestor=daltunay)
8 | fi
9 |
10 | docker build -t daltunay . &&
11 | docker run -p 8501:8501 daltunay
12 |
--------------------------------------------------------------------------------
/src/computer_vision/landmarks/__init__.py:
--------------------------------------------------------------------------------
1 | from src.computer_vision.landmarks.base import BaseLandmarkerApp
2 | from src.computer_vision.landmarks.face_landmarks import FaceLandmarkerApp
3 | from src.computer_vision.landmarks.pose_landmarks import PoseLandmarkerApp
4 |
5 | __all__ = ["BaseLandmarkerApp", "FaceLandmarkerApp", "PoseLandmarkerApp"]
6 |
--------------------------------------------------------------------------------
/.streamlit/secrets.toml.example:
--------------------------------------------------------------------------------
1 | [twilio]
2 | TWILIO_ACCOUNT_SID = "<...>"
3 | TWILIO_AUTH_TOKEN = "<...>"
4 |
5 | [openai]
6 | OPENAI_API_KEY = "<...>"
7 |
8 | [together]
9 | TOGETHER_API_KEY = "<...>"
10 |
11 | [lakera_guard]
12 | LAKERA_GUARD_API_KEY = "<...>"
13 |
14 | [google]
15 | GOOGLE_API_KEY = "<...>"
16 | GOOGLE_CSE_ID = "<...>"
17 |
--------------------------------------------------------------------------------
/src/statistics/dimensionality_reduction/__init__.py:
--------------------------------------------------------------------------------
1 | from src.statistics.dimensionality_reduction.pca_manager import PCAManager
2 | from src.statistics.dimensionality_reduction.tsne_manager import TSNEManager
3 | from src.statistics.dimensionality_reduction.umap_manager import UMAPManager
4 |
5 | __all__ = ["PCAManager", "TSNEManager", "UMAPManager"]
6 |
--------------------------------------------------------------------------------
/pages/large_language_models/__init__.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import yaml
4 |
5 | import utils
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | with open("config/models.yaml") as f:
11 | LLM_CONFIG: t.Dict[str, str] = yaml.safe_load(f)["generative_ai"][
12 | "large_language_models"
13 | ]
14 |
15 | __all__ = ["LLM_CONFIG"]
16 |
--------------------------------------------------------------------------------
/utils/secrets.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import streamlit as st
4 |
5 | import utils
6 |
7 | logger = utils.CustomLogger(__file__)
8 |
9 |
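# Copy every secret from .streamlit/secrets.toml (grouped into sections, as in
# .streamlit/secrets.toml.example) into environment variables, logging a masked value.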
10 | def load_secrets():
11 | for secrets in st.secrets.values():
12 | for secret_name, secret in secrets.items():
13 | masked_secret = secret[:4] + "*" * (len(secret) - 4)
14 | logger.info(f"Setting {secret_name}={masked_secret}")
15 | os.environ[secret_name] = secret
16 |
--------------------------------------------------------------------------------
/config/providers.yaml:
--------------------------------------------------------------------------------
1 | openai:
2 | name: OpenAI
3 | url: https://openai.com/
4 | api:
5 | help: https://platform.openai.com/account/api-keys
6 | endpoint: https://api.openai.com/v1/images/generations
7 | key: OPENAI_API_KEY
8 |
9 | together:
10 | name: Together AI
11 | url: https://www.together.ai/
12 | api:
13 | help: https://api.together.xyz/settings/api-keys
14 | endpoint: https://api.together.xyz/inference
15 | key: TOGETHER_API_KEY
16 |
--------------------------------------------------------------------------------
/utils/turn.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import streamlit as st
4 | from twilio.rest import Client
5 |
6 | import utils
7 |
8 | logger = utils.CustomLogger(__file__)
9 |
10 |
11 | @st.cache_data(show_spinner=False)
12 | def get_ice_servers():
13 | account_sid = os.getenv("TWILIO_ACCOUNT_SID")
14 | auth_token = os.getenv("TWILIO_AUTH_TOKEN")
15 |
16 | client = Client(account_sid, auth_token)
17 | token = client.tokens.create()
18 |
19 | return token.ice_servers
20 |
--------------------------------------------------------------------------------
/pages/landmarks/face_landmarks.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.computer_vision.landmarks import FaceLandmarkerApp
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 | st_ss = st.session_state
12 |
13 |
14 | def main():
15 | utils.show_source_code("src/computer_vision/landmarks/face_landmarks.py")
16 |
17 | st_ss.setdefault("face_app", FaceLandmarkerApp()).stream()
18 |
--------------------------------------------------------------------------------
/pages/landmarks/pose_landmarks.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.computer_vision.landmarks import PoseLandmarkerApp
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 | st_ss = st.session_state
12 |
13 |
14 | def main():
15 | utils.show_source_code("src/computer_vision/landmarks/pose_landmarks.py")
16 |
17 | st_ss.setdefault("pose_app", PoseLandmarkerApp()).stream()
18 |
--------------------------------------------------------------------------------
/src/generative_ai/image_generation/__init__.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import yaml
4 |
5 | from src.generative_ai.image_generation.dall_e import dall_e_image
6 | from src.generative_ai.image_generation.stable_diffusion import \
7 | stable_diffusion_image
8 |
9 | with open("config/models.yaml") as f:
10 | IMAGE_GEN_CONFIG: t.Dict[str, str] = yaml.safe_load(f)["generative_ai"][
11 | "image_creation"
12 | ]
13 |
14 | __all__ = ["IMAGE_GEN_CONFIG", "dall_e_image", "stable_diffusion_image"]
15 |
--------------------------------------------------------------------------------
/pages/object_detection/face_detection.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.computer_vision.object_detection import FaceDetectionApp
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 | st_ss = st.session_state
12 |
13 |
14 | def main():
15 | utils.show_source_code(
16 | path="src/computer_vision/object_detection/face_detection.py"
17 | )
18 |
19 | st_ss.setdefault("face_detection_app", FaceDetectionApp()).stream()
20 |
--------------------------------------------------------------------------------
/pages/object_detection/multi_objects.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.computer_vision.object_detection import MultiObjectsDetectionApp
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 | st_ss = st.session_state
12 |
13 |
14 | def main():
15 | utils.show_source_code(path="src/computer_vision/object_detection/multi_objects.py")
16 |
17 | st_ss.setdefault("multi_objects_detection_app", MultiObjectsDetectionApp()).stream()
18 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/__init__.py:
--------------------------------------------------------------------------------
1 | from src.generative_ai.large_language_models.callbacks import \
2 | StreamingChatCallbackHandler
3 | from src.generative_ai.large_language_models.chatbots import (
4 | Chatbot, ChatbotRAG, ChatbotTools, ChatbotWebSummary)
5 | from src.generative_ai.large_language_models.ingest import get_vector_store
6 |
7 | __all__ = [
8 | "Chatbot",
9 | "ChatbotRAG",
10 | "ChatbotTools",
11 | "ChatbotWebSummary",
12 | "StreamingChatCallbackHandler",
13 | "get_vector_store",
14 | ]
15 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/chatbots/__init__.py:
--------------------------------------------------------------------------------
1 | from src.generative_ai.large_language_models.chatbots.chatbot import (
2 | Chatbot, ModelArgs)
3 | from src.generative_ai.large_language_models.chatbots.chatbot_rag import \
4 | ChatbotRAG
5 | from src.generative_ai.large_language_models.chatbots.chatbot_tools import \
6 | ChatbotTools
7 | from src.generative_ai.large_language_models.chatbots.chatbot_web_summary import \
8 | ChatbotWebSummary
9 |
10 | __all__ = ["Chatbot", "ModelArgs", "ChatbotRAG", "ChatbotTools", "ChatbotWebSummary"]
11 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/callbacks.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from langchain.callbacks.base import BaseCallbackHandler
3 |
4 |
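# Streams LLM output into the Streamlit UI: an empty placeholder is created when the LLM
# starts, filled token by token, then overwritten with the final generation on completion.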
5 | class StreamingChatCallbackHandler(BaseCallbackHandler):
6 | def __init__(self):
7 | pass
8 |
9 | def on_llm_start(self, *args, **kwargs):
10 | self.container = st.empty()
11 | self.text = ""
12 |
13 | def on_llm_new_token(self, token: str, *args, **kwargs):
14 | self.text += token
15 | self.container.markdown(
16 | body=self.text,
17 | unsafe_allow_html=False,
18 | )
19 |
20 | def on_llm_end(self, response, *args, **kwargs):
21 | self.container.markdown(
22 | body=response.generations[0][0].text,
23 | unsafe_allow_html=False,
24 | )
25 |
--------------------------------------------------------------------------------
/src/generative_ai/image_generation/dall_e.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from openai import OpenAI
3 | from PIL import Image
4 |
5 | import utils
6 | from utils.misc import base64_to_img
7 |
8 | logger = utils.CustomLogger(__file__)
9 |
10 |
11 | @st.cache_data(show_spinner="Generating picture...")
12 | def dall_e_image(
13 | prompt: str,
14 | width: int = 1024,
15 | height: int = 1024,
16 | ) -> Image.Image:
17 | from src.generative_ai.image_generation import IMAGE_GEN_CONFIG
18 |
19 | model_config = IMAGE_GEN_CONFIG["DALL-E 2"]
20 |
21 | client = OpenAI()
22 | response = client.images.generate(
23 | model=model_config["string"],
24 | prompt=prompt,
25 | size=f"{width}x{height}",
26 | n=1,
27 | response_format="b64_json",
28 | )
29 | base64 = response.data[0].b64_json
30 | return base64_to_img(base64)
31 |
--------------------------------------------------------------------------------
/pages/image_generation/dall_e.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.generative_ai.image_generation import dall_e_image
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 |
12 | def main():
13 | utils.show_source_code("src/generative_ai/image_generation/dall_e.py")
14 |
15 | submitted = False
16 | with st.form(key="dall_e_form"):
17 | prompt = st.text_input(label="Input prompt: ")
18 | centered = st.columns(3)[1]
19 | with centered:
20 | submitted = st.form_submit_button(
21 | label="Generate with DALL·E", use_container_width=True
22 | )
23 | st.subheader(body="Output", anchor=False)
24 | if submitted:
25 | image = dall_e_image(prompt=prompt)
26 | st.image(image=image, caption=f"{prompt} - Generated by DALL·E")
27 |
--------------------------------------------------------------------------------
/src/generative_ai/image_generation/stable_diffusion.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import streamlit as st
4 | import together
5 | from PIL import Image
6 |
7 | import utils
8 | from utils.misc import base64_to_img
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 |
13 | @st.cache_data(show_spinner="Generating picture...")
14 | def stable_diffusion_image(
15 | prompt: str,
16 | width: int = 1024,
17 | height: int = 1024,
18 | ) -> Image.Image:
19 | from src.generative_ai.image_generation import IMAGE_GEN_CONFIG
20 |
21 | model_config = IMAGE_GEN_CONFIG["Stable Diffusion 2.1"]
22 | together.api_key = os.getenv("TOGETHER_API_KEY")
23 |
24 | response = together.Image.create(
25 | model=f"{model_config['owner']}/{model_config['string']}",
26 | prompt=prompt,
27 | width=width,
28 | height=height,
29 | )
30 |
31 | base64 = response["output"]["choices"][0]["image_base64"]
32 | return base64_to_img(base64)
33 |
--------------------------------------------------------------------------------
/pages/image_generation/stable_diffusion.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.generative_ai.image_generation import stable_diffusion_image
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 |
12 | def main():
13 | utils.show_source_code(
14 | path="src/generative_ai/image_generation/stable_diffusion.py"
15 | )
16 |
17 | submitted = False
18 | with st.form(key="stable_diffusion_form"):
19 | prompt = st.text_input(label="Input prompt: ")
20 | centered = st.columns(3)[1]
21 | with centered:
22 | submitted = st.form_submit_button(
23 | label="Generate with Stable Diffusion", use_container_width=True
24 | )
25 | st.subheader(body="Output", anchor=False)
26 | if submitted:
27 | image = stable_diffusion_image(prompt=prompt)
28 | st.image(image=image, caption=f"{prompt} - Generated by Stable Diffusion")
29 |
--------------------------------------------------------------------------------
/config/models.yaml:
--------------------------------------------------------------------------------
1 | generative_ai:
2 | large_language_models:
3 | GPT-3.5 Turbo:
4 | provider: openai
5 | organization: OpenAI
6 | owner: null
7 | string: gpt-3.5-turbo
8 | experimental_flag: true
9 |
10 | LLaMA-2 Chat (7B):
11 | provider: together
12 | organization: Meta
13 | owner: togethercomputer
14 | string: llama-2-7b-chat
15 | experimental_flag: true
16 |
17 | Mistral (7B) Instruct:
18 | provider: together
19 | organization: mistralai
20 | owner: mistralai
21 | string: Mistral-7B-Instruct-v0.1
22 | experimental_flag: true
23 |
24 | image_creation:
25 | DALL-E 2:
26 | provider: openai
27 | organization: OpenAI
28 | owner: null
29 | string: dall-e-2
30 | experimental_flag: true
31 |
32 | Stable Diffusion 2.1:
33 | provider: together
34 | organization: Stability AI
35 | owner: stabilityai
36 | string: stable-diffusion-2-1
37 | experimental_flag: true
38 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from utils.callbacks import update_slider_callback
2 | from utils.image_annotation import annotate_time
3 | from utils.logging import CustomLogger
4 | from utils.misc import (base64_to_img, generate_logo_link,
5 | reset_session_state_key, show_logos, show_source_code)
6 | from utils.pages_config import PageConfigLoader
7 | from utils.secrets import load_secrets
8 | from utils.shap import st_shap
9 | from utils.streamlit_display import display_tab_content, tabs_config
10 | from utils.turn import get_ice_servers
11 | from utils.widgets import LakeraWidget, LanguageWidget
12 |
13 | __all__ = [
14 | "base64_to_img",
15 | "generate_logo_link",
16 | "load_secrets",
17 | "CustomLogger",
18 | "show_logos",
19 | "show_source_code",
20 | "LakeraWidget",
21 | "LanguageWidget",
22 | "PageConfigLoader",
23 | "reset_session_state_key",
24 | "get_ice_servers",
25 | "annotate_time",
26 | "tabs_config",
27 | "display_tab_content",
28 | "update_slider_callback",
29 | "st_shap",
30 | ]
31 |
--------------------------------------------------------------------------------
/utils/image_annotation.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | import cv2
4 | from numpy import ndarray
5 |
6 |
7 | def annotate_time(image: ndarray) -> None:
8 | text = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
9 | text_args = {
10 | "text": text,
11 | "fontFace": cv2.FONT_HERSHEY_SIMPLEX,
12 | "fontScale": .5,
13 | "thickness": 1,
14 | }
15 | text_size = cv2.getTextSize(**text_args)[0]
16 | rect_width, rect_height = text_size[0] + 20, text_size[1] + 20
17 | cv2.rectangle(
18 | img=image,
19 | pt1=(0, 0),
20 | pt2=(rect_width, rect_height),
21 | color=(255, 255, 255),
22 | thickness=cv2.FILLED,
23 | )
24 | cv2.rectangle(
25 | img=image,
26 | pt1=(0, 0),
27 | pt2=(rect_width, rect_height),
28 | color=(0, 0, 0),
29 | thickness=2,
30 | )
31 | cv2.putText(
32 | img=image,
33 | org=(10, text_size[1] + 10),
34 | color=(0, 0, 0),
35 | lineType=cv2.LINE_AA,
36 | **text_args,
37 | )
38 |
--------------------------------------------------------------------------------
/utils/widgets/language.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import streamlit as st
4 |
5 | import utils
6 |
7 | logger = utils.CustomLogger(__file__)
8 |
9 | st_ss = st.session_state
10 |
11 |
12 | class LanguageWidget:
13 | widget_key = "language_widget"
14 | selectbox_key = f"{widget_key}.selection"
15 |
16 | def __init__(
17 | self,
18 | languages: t.List[str] | None = None,
19 | default: str | None = None,
20 | ):
21 | logger.info(f"Initializing {self.__class__.__name__}")
22 | self.languages = languages or ["English", "French"]
23 | self.default = default or "English"
24 |
25 | @property
26 | def selected_language(self):
27 | return st.selectbox(
28 | label="Language:",
29 | options=list(self.languages),
30 | index=list(self.languages).index(
31 | st_ss.get(self.selectbox_key, self.default)
32 | ),
33 | key=self.selectbox_key,
34 | help="Changes the **chat language only**, not the interface language",
35 | )
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Daniel Altunay
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/computer_vision/landmarks/pose_landmarks.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | import mediapipe as mp
5 |
6 | from src.computer_vision.landmarks import BaseLandmarkerApp
7 |
8 |
9 | class PoseLandmarkerApp(BaseLandmarkerApp):
10 | landmarks_type = "pose_landmarks"
11 |
12 | def __init__(self):
13 | super().__init__()
14 |
15 | @cached_property
16 | def landmarker(self) -> mp.solutions.pose.Pose:
17 | return mp.solutions.pose.Pose(
18 | static_image_mode=False,
19 | model_complexity=1,
20 | smooth_landmarks=True,
21 | enable_segmentation=False,
22 | min_detection_confidence=0.5,
23 | min_tracking_confidence=0.5,
24 | )
25 |
26 | @cached_property
27 | def connections_list(self) -> t.List[t.FrozenSet[t.Tuple[int, int]]]:
28 | return [mp.solutions.pose.POSE_CONNECTIONS]
29 |
30 | @cached_property
31 | def drawing_specs_list(
32 | self,
33 | ) -> t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]]:
34 | return [
35 | {
36 | "landmark_drawing_spec": mp.solutions.drawing_styles.get_default_pose_landmarks_style()
37 | }
38 | ]
39 |
--------------------------------------------------------------------------------
/utils/logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import typing as t
3 | from functools import cached_property
4 |
5 | import streamlit as st
6 |
7 |
8 | class CustomLogger:
9 | method_names = ["debug", "info", "warning", "error", "critical"]
10 |
11 | def __init__(self, file: str, level: str = "info"):
12 | self.file = file.split("my-superapp")[1] if "my-superapp" in file else file
13 | self.level = getattr(logging, level.upper())
14 | self.cache_methods(methods_to_cache=self.method_names)
15 |
16 | @cached_property
17 | def logger(self) -> logging.Logger:
18 | logger = logging.getLogger(self.file)
19 | logger.setLevel(self.level)
20 | stream_handler = logging.StreamHandler()
21 | formatter = logging.Formatter(
22 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
23 | )
24 | stream_handler.setFormatter(formatter)
25 | logger.addHandler(stream_handler)
26 |
27 | return logger
28 |
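# Wrap each logging method in st.cache_resource so an identical call (same message and
# arguments) is only emitted once across Streamlit reruns instead of being re-logged.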
29 | def cache_methods(self, methods_to_cache: t.List[str]) -> None:
30 | for method_name in methods_to_cache:
31 | method = getattr(self.logger, method_name)
32 | wrapped_method = st.cache_resource(func=method, show_spinner=False)
33 | setattr(self, method_name, wrapped_method)
34 |
--------------------------------------------------------------------------------
/src/statistics/statistical_tests/chi_squared.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 | import streamlit as st
5 | from scipy.stats import chi2_contingency
6 |
7 |
8 | class Chi2Testing:
9 | def __init__(
10 | self,
11 | observed: pd.DataFrame,
12 | alpha: float,
13 | ):
14 | self.observed = observed
15 | self.alpha = alpha
16 |
17 | @staticmethod
18 | @st.cache_data(show_spinner=False)
19 | def chi2_test(
20 | observed: pd.DataFrame,
21 | ) -> t.Tuple[float, float, int, t.List[t.List[float]]]:
22 | chi2, p_value, dof, expected = chi2_contingency(observed)
23 | return chi2, p_value, dof, expected
24 |
25 | @staticmethod
26 | @st.cache_data(show_spinner=False)
27 | def is_statistically_significant(p_value: float, alpha: float) -> bool:
28 | return p_value < alpha
29 |
30 | def perform_chi2_test(self) -> t.Dict[str, t.Any]:
31 | chi2, p_value, dof, expected = self.chi2_test(self.observed)
32 | is_significant = self.is_statistically_significant(p_value, self.alpha)
33 |
34 | return {
35 | "chi2_statistic": chi2,
36 | "p_value": p_value,
37 | "degrees_of_freedom": dof,
38 | "expected_frequencies": expected,
39 | "is_significant": is_significant,
40 | }
41 |
--------------------------------------------------------------------------------
/utils/pages_config.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | import yaml
5 |
6 | import utils
7 |
8 |
9 | class PageConfigLoader:
10 | config_path = "pages/pages_config.yaml"
11 |
12 | def __init__(self, file):
13 | self.file = file
14 | self.logger = utils.CustomLogger(self.file)
15 |
16 | @cached_property
17 | def pages_config(self) -> t.Dict:
18 | with open(self.config_path, "r") as file:
19 | pages_config = yaml.safe_load(file)
20 | return pages_config
21 |
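# Configuration for the current page, looked up in pages/pages_config.yaml by using the
# page's path segments under pages/ (e.g. "clustering", "dbscan.py") as nested keys.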
22 | @cached_property
23 | def page_config(self) -> t.Dict:
24 | path_keys = self.file.split("my-superapp/pages/")[1].split("/")
25 | section = self.pages_config
26 |
27 | for path_key in path_keys:
28 | section = section.get(path_key, {})
29 |
30 | return self._set_recursive(section, path_keys)
31 |
32 | def _set_recursive(self, section, keys) -> t.Dict:
33 | return {
34 | key: self._set_recursive(value, keys + [key])
35 | if isinstance(value, dict)
36 | else value
37 | for key, value in section.items()
38 | }
39 |
40 | def set_page_config(self, _globals):
41 | self.logger.info(f"Setting page config: {self.page_config}")
42 | for key, value in self.page_config.items():
43 | _globals[key] = value
44 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "my-superapp"
3 | version = "0.0.0"
4 | description = "Daniel ALTUNAY's superapp!"
5 | authors = ["Daniel Altunay "]
6 | readme = "README.md"
7 | homepage = "https://data-science-superapp.streamlit.app/"
8 | repository = "https://github.com/daltunay/my-superapp"
9 | packages = [{ include = "src" }, { include = "utils" }, { include = "pages" }]
10 |
11 | [tool.poetry.dependencies]
12 | python = "~3.11"
13 | # Streamlit
14 | streamlit = "^1.29.0"
15 | streamlit-superapp = "^1.3.0"
16 | streamlit-webrtc = "^0.47.1"
17 | twilio = "^8.10.3"
18 | watchdog = "^3.0.0"
19 | # Computer Vision
20 | opencv-python-headless = "^4.8.1.78"
21 | av = ">=9.0.0,<11.0.0"
22 | ultralytics = "^8.0.222"
23 | mediapipe = "^0.10.8"
24 | # LLMs
25 | langchain = "^0.0.345"
26 | openai = "^1.3.5"
27 | together = "^0.2.8"
28 | tiktoken = "^0.5.1"
29 | faiss-cpu = "^1.7.4"
30 | transformers = "^4.35.2"
31 | pypdf = "^3.17.1"
32 | unstructured = "^0.11.2"
33 | validators = "^0.22.0"
34 | ## Tools
35 | google-api-python-client = "^2.108.0"
36 | arxiv = "^2.0.0"
37 | wikipedia = "^1.4.0"
38 | stackapi = "^0.3.0"
39 | # Machine Learning
40 | scipy = "^1.11.4"
41 | scikit-learn = "^1.3.2"
42 | xgboost = "^2.0.2"
43 | shap = "^0.44.0"
44 | umap-learn = "^0.5.5"
45 | # Data Visualization
46 | plotly = "^5.18.0"
47 |
48 | [tool.poetry.group.dev.dependencies]
49 | ruff = "^0.1.3"
50 | isort = "^5.12.0"
51 | mypy = "^1.6.1"
52 | ipykernel = "^6.26.0"
53 |
54 | [tool.poetry.group.types.dependencies]
55 | types-requests = "^2.31.0.10"
56 |
57 | [build-system]
58 | requires = ["poetry-core"]
59 | build-backend = "poetry.core.masonry.api"
60 |
--------------------------------------------------------------------------------
/src/computer_vision/landmarks/face_landmarks.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | import mediapipe as mp
5 |
6 | from src.computer_vision.landmarks import BaseLandmarkerApp
7 |
8 |
9 | class FaceLandmarkerApp(BaseLandmarkerApp):
10 | landmarks_type = "multi_face_landmarks"
11 |
12 | def __init__(self):
13 | super().__init__()
14 |
15 | @cached_property
16 | def landmarker(self) -> mp.solutions.face_mesh.FaceMesh:
17 | return mp.solutions.face_mesh.FaceMesh(
18 | static_image_mode=False,
19 | max_num_faces=1,
20 | refine_landmarks=True,
21 | min_detection_confidence=0.5,
22 | min_tracking_confidence=0.5,
23 | )
24 |
25 | @cached_property
26 | def connections_list(self) -> t.List[t.FrozenSet[t.Tuple[int, int]]]:
27 | return [
28 | mp.solutions.face_mesh.FACEMESH_TESSELATION,
29 | mp.solutions.face_mesh.FACEMESH_CONTOURS,
30 | mp.solutions.face_mesh.FACEMESH_IRISES,
31 | ]
32 |
33 | @cached_property
34 | def drawing_specs_list(
35 | self,
36 | ) -> t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]]:
37 | return [
38 | {"connection_drawing_spec": style, "landmark_drawing_spec": None}
39 | for style in (
40 | mp.solutions.drawing_styles.get_default_face_mesh_tesselation_style(),
41 | mp.solutions.drawing_styles.get_default_face_mesh_contours_style(),
42 | mp.solutions.drawing_styles.get_default_face_mesh_iris_connections_style(),
43 | )
44 | ]
45 |
--------------------------------------------------------------------------------
/utils/misc.py:
--------------------------------------------------------------------------------
1 | from base64 import b64decode
2 | from io import BytesIO
3 |
4 | import streamlit as st
5 | from PIL import Image
6 |
7 | st_ss = st.session_state
8 |
9 |
10 | def generate_logo_link(url: str, img_url: str) -> str:
11 | return f'<a href="{url}"><img src="{img_url}"></a>'
12 |
13 |
14 | def show_source_code(path: str):
15 | st.markdown(
16 | "[Source code]"
17 | f"(https://github.com/daltunay/my-superapp/tree/main/{path})"
18 | )
19 |
20 |
21 | def show_logos(linkedin: bool = True, github: bool = True):
22 | logos = []
23 |
24 | if linkedin:
25 | logos.append(
26 | generate_logo_link(
27 | url="https://linkedin.com/in/daltunay",
28 | img_url="https://img.icons8.com/?id=13930&format=png",
29 | )
30 | )
31 |
32 | if github:
33 | logos.append(
34 | generate_logo_link(
35 | url="https://github.com/daltunay",
36 | img_url="https://img.icons8.com/?id=AZOZNnY73haj&format=png",
37 | )
38 | )
39 |
40 | logos_html = "".join(logos)
41 | html_content = f"""
42 |
43 | Made by Daniel Altunay
44 | {logos_html}
45 |
46 | """
47 |
48 | st.markdown(html_content, unsafe_allow_html=True)
49 |
50 |
51 | def base64_to_img(base64: str) -> Image.Image:
52 | return Image.open(BytesIO(b64decode(base64)))
53 |
54 |
55 | def reset_session_state_key(key: str):
56 | if hasattr(st_ss, key):
57 | delattr(st_ss, key)
58 |
--------------------------------------------------------------------------------
/pages/clustering/dbscan.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning.clustering import DBScanManager
5 | from src.machine_learning.datasets import Dataset
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 | utils.show_source_code("src/machine_learning/clustering/dbscan_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type=None)
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X, y = dataset.X, dataset.y
28 | if label_mapping := dataset.label_mapping:
29 | y = y.map(label_mapping)
30 |
31 | st.subheader("Visualize data")
32 | with st.container(border=True):
33 | utils.display_tab_content("data", X, y)
34 |
35 | st.subheader("DBSCAN")
36 | with st.container(border=True):
37 | dbscan_manager = DBScanManager()
38 | dbscan_manager.set_model()
39 |
40 | dbscan_manager.fit(data=X)
41 |
42 | st.subheader("Scatter plot", divider="gray")
43 | col_x, col_y = st.columns(2)
44 | x_col_scatter = col_x.selectbox(
45 | label="X column", key="scatter_x", options=X.columns, index=0
46 | )
47 | y_col_scatter = col_y.selectbox(
48 | label="Y column", key="scatter_y", options=X.columns, index=1
49 | )
50 | st.plotly_chart(
51 | dbscan_manager.scatter_plot(x_col_scatter, y_col_scatter),
52 | use_container_width=True,
53 | )
54 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/ingest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import typing as t
3 |
4 | from langchain.document_loaders import DirectoryLoader, PyPDFLoader
5 | from langchain.embeddings import OpenAIEmbeddings
6 | from langchain.text_splitter import RecursiveCharacterTextSplitter
7 | from langchain.vectorstores import FAISS
8 |
9 | import utils
10 |
11 |
12 | def get_loader(
13 | file: str | None = None,
14 | mode: t.Literal["local"] | t.Literal["upload"] = "local",
15 | ) -> DirectoryLoader | PyPDFLoader:
16 | if mode == "local":
17 | return DirectoryLoader(
18 | path="data/documents/",
19 | glob="./*.pdf",
20 | loader_cls=PyPDFLoader,
21 | show_progress=True,
22 | )
23 | elif mode == "upload":
24 | return PyPDFLoader(file)
25 |
26 |
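# Build a FAISS vector store from PDF documents: in "local" mode, index every PDF under
# data/documents/ and save it to faiss_index/; in "upload" mode, index a single uploaded
# file and return the in-memory store instead.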
27 | def get_vector_store(
28 | file: str | None = None,
29 | mode: t.Literal["local"] | t.Literal["upload"] = "local",
30 | ) -> FAISS | None:
31 | loader = get_loader(file=file, mode=mode)
32 | documents = loader.load()
33 | splitter = RecursiveCharacterTextSplitter(
34 | chunk_size=1000,
35 | chunk_overlap=50,
36 | length_function=len,
37 | )
38 | documents_chunked = splitter.split_documents(documents)
39 | embeddings = OpenAIEmbeddings()
40 | db = FAISS.from_documents(documents=documents_chunked, embedding=embeddings)
41 |
42 | if mode == "local":
43 | db.save_local(
44 | folder_path="faiss_index",
45 | index_name="index" if mode == "local" else os.path.splitext(file)[0],
46 | )
47 | elif mode == "upload":
48 | return db
49 |
50 |
51 | def main():
52 | get_vector_store(file=None, mode="local")
53 |
54 |
55 | if __name__ == "__main__":
56 | utils.load_secrets()
57 | main()
58 |
--------------------------------------------------------------------------------
/pages/large_language_models/chatbot_web_summary.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import validators
3 |
4 | import utils
5 | from pages.large_language_models import LLM_CONFIG
6 | from src.generative_ai.large_language_models import ChatbotWebSummary
7 |
8 | loader = utils.PageConfigLoader(__file__)
9 | loader.set_page_config(globals())
10 |
11 | logger = utils.CustomLogger(__file__)
12 |
13 | st_ss = st.session_state
14 |
15 |
16 | def main():
17 | utils.show_source_code(
18 | path="src/generative_ai/large_language_models/chatbots/chatbot_web_summary.py"
19 | )
20 | chosen_model = st.selectbox(
21 | label="Large Language Model:",
22 | placeholder="Choose an option",
23 | options=LLM_CONFIG.keys(),
24 | index=0,
25 | on_change=utils.reset_session_state_key,
26 | kwargs={"key": "chatbot_web_summary"},
27 | )
28 |
29 | chosen_chain_type = st.selectbox(
30 | label="Chain type:",
31 | options=ChatbotWebSummary.available_chain_types,
32 | index=None,
33 | on_change=utils.reset_session_state_key,
34 | kwargs={"key": "chatbot_web_summary"},
35 | )
36 |
37 | if chosen_model and chosen_chain_type:
38 | chatbot = st_ss.setdefault(
39 | "chatbot_web_summary", ChatbotWebSummary(**LLM_CONFIG[chosen_model])
40 | )
41 | else:
42 | st.info("Choose a chain type for the LLM", icon="ℹ️")
43 |
44 | if input_url := st.text_input(
45 | label="URL of the page to summarize:",
46 | disabled=not (chosen_model and chosen_chain_type),
47 | ):
48 | if validators.url(input_url):
49 | st.chat_message("human").write(input_url)
50 | with st.chat_message("ai"):
51 | chatbot.summarize(url=input_url)
52 | else:
53 | st.error("Invalid URL", icon="❌")
54 |
--------------------------------------------------------------------------------
/src/computer_vision/object_detection/multi_objects.py:
--------------------------------------------------------------------------------
1 | from functools import cached_property
2 |
3 | import streamlit_webrtc as st_webrtc
4 | from av import VideoFrame
5 | from numpy import ndarray
6 | from ultralytics import YOLO
7 | from ultralytics.engine.results import Results
8 |
9 | import utils
10 |
11 | logger = utils.CustomLogger(__file__)
12 |
13 |
14 | class MultiObjectsDetectionApp:
15 | def __init__(self):
16 | pass
17 |
18 | @cached_property
19 | def detector(self) -> YOLO:
20 | return YOLO(model="yolov8n.pt", task=None)
21 |
22 | def detect_objects(self, image: ndarray) -> Results:
23 | return self.detector.predict(
24 | source=image,
25 | stream=False,
26 | show=False,
27 | show_labels=True,
28 | show_conf=True,
29 | verbose=False,
30 | )
31 |
32 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame:
33 | image = frame.to_ndarray(format="bgr24")
34 |
35 | detections = self.detect_objects(image)
36 | image = self.annotate_detections(detections)
37 | utils.annotate_time(image)
38 | return VideoFrame.from_ndarray(image, format="bgr24")
39 |
40 | def stream(self) -> None:
41 | st_webrtc.webrtc_streamer(
42 | video_frame_callback=self.video_frame_callback,
43 | key="multi_objects_streamer",
44 | mode=st_webrtc.WebRtcMode.SENDRECV,
45 | rtc_configuration=st_webrtc.RTCConfiguration(
46 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"}
47 | ),
48 | media_stream_constraints={"video": True, "audio": False},
49 | async_processing=True,
50 | desired_playing_state=None,
51 | )
52 |
53 | @staticmethod
54 | def annotate_detections(detections: Results) -> ndarray:
55 | return detections[0].plot()
56 |
--------------------------------------------------------------------------------
/utils/widgets/lakera.py:
--------------------------------------------------------------------------------
1 | import os
2 | import typing as t
3 |
4 | import requests
5 | import streamlit as st
6 |
7 | import utils
8 |
9 | logger = utils.CustomLogger(__file__)
10 |
11 | st_ss = st.session_state
12 |
13 |
14 | class LakeraWidget:
15 | widget_key = "lakera_widget"
16 | checkbox_key = f"{widget_key}.checkbox"
17 |
18 | def __init__(
19 | self,
20 | default: bool = False,
21 | ):
22 | logger.info(f"Initializing {self.__class__.__name__}")
23 | self.api_key = os.getenv("LAKERA_GUARD_API_KEY")
24 | self.default = default
25 |
26 | @property
27 | def lakera_activated(self):
28 | return st.checkbox(
29 | label="Prompt injection security",
30 | value=st_ss.get(self.checkbox_key, self.default),
31 | key=self.checkbox_key,
32 | help="Use Lakera Guard API to defend against LLM prompt injections",
33 | on_change=self.authenticate,
34 | )
35 |
36 | def request_api(self, input: str) -> requests.Response:
37 | return requests.post(
38 | url="https://api.lakera.ai/v1/prompt_injection",
39 | json={"input": input},
40 | headers={"Authorization": f"Bearer {self.api_key}"},
41 | )
42 |
43 | def authenticate(self):
44 | if not st_ss.get(self.checkbox_key):
45 | return
46 | try:
47 | response = self.request_api("")
48 | except requests.exceptions.SSLError:
49 | st.toast("SSL CERTIFICATE VERIFY FAILED", icon="🚫")
50 | else:
51 | success = response.ok
52 | st.toast("Lakera Guard API authentication", icon="✅" if success else "🚫")
53 |
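# Check a prompt against Lakera Guard's prompt-injection endpoint and return
# (flagged, full API response).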
54 | def flag_prompt(self, prompt: str) -> t.Tuple[bool, t.Dict]:
55 | response = self.request_api(prompt).json()
56 | flagged = response["results"][0]["flagged"]
57 | return flagged, response
58 |
--------------------------------------------------------------------------------
/pages/dimensionality_reduction/t-sne.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning.datasets import Dataset
5 | from src.statistics.dimensionality_reduction import TSNEManager
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 | utils.show_source_code("src/statistics/dimensionality_reduction/tsne_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type=None)
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X, y = dataset.X, dataset.y
28 | if label_mapping := dataset.label_mapping:
29 | y = y.map(label_mapping)
30 |
31 | st.subheader("Visualize data")
32 | with st.container(border=True):
33 | utils.display_tab_content("data", X, y)
34 |
35 | st.subheader("t-SNE")
36 | with st.container(border=True):
37 | tsne_manager = TSNEManager(max_n_components=3)
38 | tsne_manager.set_model()
39 |
40 | tsne_manager.fit(data=X, target_col=y)
41 |
42 | st.subheader("Scatter matrix plot", divider="gray")
43 | st.plotly_chart(tsne_manager.scatter_matrix_plot(), use_container_width=True)
44 |
45 | st.subheader("Scatter 2D plot", divider="gray")
46 | try:
47 | st.plotly_chart(tsne_manager.scatter_2d_plot(), use_container_width=True)
48 | except ValueError:
49 | st.error("Number of principal components not sufficient for the plot")
50 |
51 | st.subheader("Scatter 3D plot", divider="gray")
52 | try:
53 | st.plotly_chart(tsne_manager.scatter_3d_plot(), use_container_width=True)
54 | except ValueError:
55 | st.error("Number of principal components not sufficient for the plot")
56 |
--------------------------------------------------------------------------------
/pages/dimensionality_reduction/umap.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning.datasets import Dataset
5 | from src.statistics.dimensionality_reduction import UMAPManager
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 | utils.show_source_code("src/statistics/dimensionality_reduction/umap_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type=None)
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X, y = dataset.X, dataset.y
28 | if label_mapping := dataset.label_mapping:
29 | y = y.map(label_mapping)
30 |
31 | st.subheader("Visualize data")
32 | with st.container(border=True):
33 | utils.display_tab_content("data", X, y)
34 |
35 | st.subheader("UMAP")
36 | with st.container(border=True):
37 | umap_manager = UMAPManager(max_n_components=3)
38 | umap_manager.set_model()
39 |
40 | umap_manager.fit(data=X, target_col=y)
41 |
42 | st.subheader("Scatter matrix plot", divider="gray")
43 | st.plotly_chart(umap_manager.scatter_matrix_plot(), use_container_width=True)
44 |
45 | st.subheader("Scatter 2D plot", divider="gray")
46 | try:
47 | st.plotly_chart(umap_manager.scatter_2d_plot(), use_container_width=True)
48 | except ValueError:
49 | st.error("Number of principal components not sufficient for the plot")
50 |
51 | st.subheader("Scatter 3D plot", divider="gray")
52 | try:
53 | st.plotly_chart(umap_manager.scatter_3d_plot(), use_container_width=True)
54 | except ValueError:
55 | st.error("Number of principal components not sufficient for the plot")
56 |
--------------------------------------------------------------------------------
/utils/streamlit_display.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 | import streamlit as st
5 |
6 |
7 | def tabs_config():
8 | st.markdown(
9 | """
10 |
17 | """,
18 | unsafe_allow_html=True,
19 | )
20 |
21 |
22 | def display_tab_content(
23 | label: t.Literal["train", "test"],
24 | X_data: pd.DataFrame,
25 | y_data: pd.DataFrame,
26 | label_mapping: t.Dict[int, str] | None = None,
27 | ):
28 | data_container = st.container()
29 | col1, col2 = data_container.columns([0.65, 0.35], gap="medium")
30 | with col1:
31 | st.markdown(
32 | f"X_{label}", unsafe_allow_html=True
33 | )
34 | st.dataframe(data=X_data, use_container_width=True)
35 |
36 | with col2:
37 | st.markdown(
38 | f"y_{label}", unsafe_allow_html=True
39 | )
40 | st.dataframe(
41 | data=y_data.map(label_mapping or (lambda x: x)), use_container_width=True
42 | )
43 |
44 | describe_container = st.expander("Data statistics").container()
45 | col1, col2 = describe_container.columns([0.65, 0.35], gap="medium")
46 | with col1:
47 | st.dataframe(X_data.describe(), use_container_width=True)
48 | with col2:
49 | if label_mapping:
50 | st.dataframe(
51 | pd.concat(
52 | [
53 | y_data.map(label_mapping).value_counts().sort_index(),
54 | y_data.map(label_mapping)
55 | .value_counts(normalize=True)
56 | .sort_index(),
57 | ],
58 | axis=1,
59 | ).round(3)
60 | )
61 | else:
62 | st.dataframe(y_data.describe(), use_container_width=True)
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [Open the app on Streamlit Cloud](https://data-science-superapp.streamlit.app)
3 |
4 |
5 | ## Prerequisites
6 |
7 | **Poetry**: If [Poetry](https://python-poetry.org/) is not installed, you can install it using pip:
8 |
9 |
10 | ```bash
11 | pip install poetry
12 | ```
13 |
14 | **Docker**: If [Docker](https://www.docker.com/) is not installed, you can install it by following [this link](https://docs.docker.com/get-docker/).
15 |
16 | ## Installation
17 |
18 | 1. Clone the repository:
19 |
20 | ```bash
21 | git clone https://github.com/daltunay/my-superapp.git
22 | cd my-superapp
23 | ```
24 |
25 | 2. Set up the project dependencies using Poetry:
26 |
27 | ```bash
28 | poetry install
29 | ```
30 |
31 | This command will create a virtual environment and install the necessary dependencies.
32 |
33 | ## Setting up API Keys
34 |
35 | The application uses several APIs to function properly.
36 | You can specify the API keys in `.streamlit/secrets.toml`:
37 |
38 | ```toml
39 | [twilio]
40 | TWILIO_ACCOUNT_SID = "<...>"
41 | TWILIO_AUTH_TOKEN = "<...>"
42 |
43 | [openai]
44 | OPENAI_API_KEY = "<...>"
45 |
46 | [together]
47 | TOGETHER_API_KEY = "<...>"
48 |
49 | [lakera_guard]
50 | LAKERA_GUARD_API_KEY = "<...>"
51 |
52 | [google]
53 | GOOGLE_API_KEY = "<...>"
54 | GOOGLE_CSE_ID = "<...>"
55 | ```
56 |
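A simple way to do this is to copy the provided template and fill in your own keys:

```bash
cp .streamlit/secrets.toml.example .streamlit/secrets.toml
```

The `.streamlit/secrets.toml` file is listed in `.gitignore`, so your keys stay out of version control.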
57 |
58 | ## Running the Application
59 | The _my-superapp_ application can be run using either Poetry or Docker.
60 |
61 | ### Using Poetry
62 |
63 | To run the application using Poetry:
64 |
65 | ```bash
66 | poetry run streamlit run app.py
67 | ```
68 |
69 | ### Using Docker
70 |
71 | 1. Build the Docker image:
72 |
73 | ```bash
74 | docker build -t my-superapp .
75 | ```
76 |
77 | 2. Run the application as a Docker container:
78 |
79 | ```bash
80 | docker run -p 8501:8501 my-superapp
81 | ```
82 |
83 | Alternatively, you can just run the following:
84 |
85 | ```bash
86 | chmod +x ./bin/run.sh
87 | ./bin/run.sh
88 | ```
89 |
90 | Once the application is running, it will be accessible at http://localhost:8501 in your web browser.
91 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/chatbots/chatbot_web_summary.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
5 | from langchain.chains.summarize import load_summarize_chain
6 | from langchain.docstore.document import Document
7 | from langchain.document_loaders import UnstructuredURLLoader
8 | from unstructured.cleaners.core import (clean, clean_extra_whitespace,
9 | remove_punctuation)
10 |
11 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs
12 |
13 |
14 | class ChatbotWebSummary(Chatbot):
15 | available_chain_types = ["stuff", "map_reduce"]
16 |
17 | def __init__(
18 | self,
19 | chain_type: t.Literal["stuff"] | t.Literal["map_reduce"] = "stuff",
20 | **model_kwargs: t.Unpack[ModelArgs],
21 | ) -> None:
22 | super().__init__(**model_kwargs)
23 | self.chain_type = chain_type
24 |
25 | @staticmethod
26 | def url_to_doc(source_url: str) -> Document:
27 | url_loader = UnstructuredURLLoader(
28 | urls=[source_url],
29 | mode="elements",
30 | post_processors=[clean, remove_punctuation, clean_extra_whitespace],
31 | )
32 |
33 | narrative_elements = [
34 | element
35 | for element in url_loader.load()
36 | if element.metadata.get("category") == "NarrativeText"
37 | ]
38 | cleaned_content = " ".join(
39 | element.page_content for element in narrative_elements
40 | )
41 |
42 | return Document(page_content=cleaned_content, metadata={"source": source_url})
43 |
44 | @cached_property
45 | def chain(self) -> BaseCombineDocumentsChain:
46 | return load_summarize_chain(self.llm, chain_type=self.chain_type, verbose=True)
47 |
48 | def summarize(self, url: str) -> str:
49 | document = self.url_to_doc(url)
50 | return self.chain.run(
51 | [document],
52 | callbacks=self.callbacks,
53 | )
54 |
--------------------------------------------------------------------------------
/pages/dimensionality_reduction/pca.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning.datasets import Dataset
5 | from src.statistics.dimensionality_reduction import PCAManager
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 |     utils.show_source_code("src/statistics/dimensionality_reduction/pca_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type=None)
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X, y = dataset.X, dataset.y
28 | if label_mapping := dataset.label_mapping:
29 | y = y.map(label_mapping)
30 |
31 | st.subheader("Visualize data")
32 | with st.container(border=True):
33 | utils.display_tab_content("data", X, y)
34 |
35 | st.subheader("PCA")
36 | with st.container(border=True):
37 | pca_manager = PCAManager(max_n_components=3)
38 | pca_manager.set_model()
39 |
40 | pca_manager.fit(data=X, target_col=y)
41 |
42 | st.subheader("Scatter matrix plot", divider="gray")
43 | st.plotly_chart(pca_manager.scatter_matrix_plot(), use_container_width=True)
44 |
45 | st.subheader("Explained variance plot", divider="gray")
46 | st.plotly_chart(pca_manager.explained_variance_plot(), use_container_width=True)
47 |
48 | st.subheader("Scatter 2D + Loadings plot", divider="gray")
49 | try:
50 | st.plotly_chart(pca_manager.loadings_plot(), use_container_width=True)
51 | except ValueError:
52 | st.error("Number of principal components not sufficient for the plot")
53 |
54 | st.subheader("Scatter 3D plot", divider="gray")
55 | try:
56 | st.plotly_chart(pca_manager.scatter_3d_plot(), use_container_width=True)
57 | except ValueError:
58 | st.error("Number of principal components not sufficient for the plot")
59 |
--------------------------------------------------------------------------------
/pages/regression/xgboost.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning import XGBoostManager
5 | from src.machine_learning.datasets import Dataset
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 | utils.show_source_code("src/machine_learning/xgboost_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type="regression")
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=True)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X_train, X_test = dataset.X
28 | y_train, y_test = dataset.y
29 | label_mapping = dataset.label_mapping
30 |
31 | st.subheader("Visualize data")
32 | train_tab, test_tab = st.tabs(tabs=["Train", "Test"])
33 | with train_tab:
34 | with st.container(border=True):
35 | utils.display_tab_content("train", X_train, y_train, label_mapping)
36 | with test_tab:
37 | with st.container(border=True):
38 | utils.display_tab_content("test", X_test, y_test, label_mapping)
39 |
40 | st.header("Regression", divider="gray")
41 | st.markdown(
42 | "Regression model: `XGBRegressor` from `xgboost` "
43 | "([official documentation](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor))"
44 | )
45 | regression_manager = XGBoostManager(task="regression")
46 |
47 | st.subheader("Hyperparameters")
48 | with st.container(border=True):
49 | regression_manager.set_model()
50 |
51 | st.subheader("Evaluation")
52 | regression_manager.fit(X_train, y_train)
53 | regression_manager.evaluate(X_test, y_test)
54 | st.markdown("Metrics Report")
55 | st.columns([0.5, 1, 0.5])[1].dataframe(
56 | data=regression_manager.metrics_report.round(2), use_container_width=True
57 | )
58 | st.subheader("Explainability")
59 | st.markdown("SHAP force plot")
60 | utils.st_shap(plot=regression_manager.shap_force_plot(X_test), height=400)
61 |
--------------------------------------------------------------------------------
/pages/clustering/kmeans.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning.clustering import KMeansManager
5 | from src.machine_learning.datasets import Dataset
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 |     utils.show_source_code("src/machine_learning/clustering/kmeans_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type=None)
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=False)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X, y = dataset.X, dataset.y
28 | if label_mapping := dataset.label_mapping:
29 | y = y.map(label_mapping)
30 |
31 | st.subheader("Visualize data")
32 | with st.container(border=True):
33 | utils.display_tab_content("data", X, y)
34 |
35 | st.subheader("K-Means")
36 | with st.container(border=True):
37 | kmeans_manager = KMeansManager(max_n_clusters=10)
38 | kmeans_manager.set_model()
39 |
40 | kmeans_manager.fit(data=X)
41 |
42 | st.subheader("Scatter plot", divider="gray")
43 | col_x, col_y = st.columns(2)
44 | x_col_scatter = col_x.selectbox(
45 | label="X column", key="scatter_x", options=X.columns, index=0
46 | )
47 | y_col_scatter = col_y.selectbox(
48 | label="Y column", key="scatter_y", options=X.columns, index=1
49 | )
50 | st.plotly_chart(
51 | kmeans_manager.scatter_plot(x_col_scatter, y_col_scatter),
52 | use_container_width=True,
53 | )
54 |
55 | st.subheader("Centroids plot", divider="gray")
56 | col_x, col_y = st.columns(2)
57 | x_col_centroids = col_x.selectbox(
58 | label="X column", key="centroids_x", options=X.columns, index=0
59 | )
60 | y_col_centroids = col_y.selectbox(
61 | label="Y column", key="centroids_y", options=X.columns, index=1
62 | )
63 | st.plotly_chart(
64 | kmeans_manager.centroids_plot(x_col_centroids, y_col_centroids),
65 | use_container_width=True,
66 | )
67 |
--------------------------------------------------------------------------------
/src/machine_learning/clustering/dbscan_manager.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import plotly.express as px
3 | import streamlit as st
4 | from sklearn.cluster import DBSCAN
5 |
6 |
7 | class DBScanManager:
8 | def __init__(self):
9 | self.model: DBSCAN | None = None
10 |
11 | @property
12 | def params(self) -> dict:
13 | columns = st.columns(2)
14 | return {
15 | "eps": columns[0].slider(
16 | label="Maximum Distance (eps)",
17 | min_value=0.1,
18 | max_value=5.0,
19 | value=1.0,
20 | step=0.1,
21 | help="Maximum distance between two samples for one to be considered as in the neighborhood of the other.",
22 | ),
23 | "min_samples": columns[1].slider(
24 | label="Minimum Samples",
25 | min_value=1,
26 | max_value=10,
27 | value=5,
28 | step=1,
29 | help="The number of samples in a neighborhood for a point to be considered as a core point.",
30 | ),
31 | }
32 |
33 | @staticmethod
34 | @st.cache_resource(show_spinner=True)
35 | def _get_model(eps: float, min_samples: int) -> DBSCAN:
36 | return DBSCAN(eps=eps, min_samples=min_samples)
37 |
38 | def set_model(self) -> None:
39 | self.model = self._get_model(**self.params)
40 |
41 | @staticmethod
42 | @st.cache_resource(
43 | show_spinner=True,
44 | hash_funcs={DBSCAN: lambda model: (model.eps, model.min_samples)},
45 | )
46 |     def _perform_clustering(model: DBSCAN, data: pd.DataFrame) -> tuple[DBSCAN, pd.DataFrame]:
47 | clusters = model.fit_predict(data)
48 | data = data.assign(Cluster=clusters)
49 | data["Cluster"] = data["Cluster"].astype(str)
50 | return model, data
51 |
52 | def fit(self, data: pd.DataFrame):
53 | self.model, self.data_clustered = self._perform_clustering(
54 | model=self.model, data=data
55 | )
56 |
57 | def scatter_plot(self, x_col: str, y_col: str) -> None:
58 | return px.scatter(
59 | self.data_clustered,
60 | x=x_col,
61 | y=y_col,
62 | color="Cluster",
63 | labels={"color": "Cluster"},
64 | )
65 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/chatbots/chatbot_rag.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | from langchain.chains import ConversationalRetrievalChain
5 | from langchain.chains.conversational_retrieval.base import \
6 | BaseConversationalRetrievalChain
7 | from langchain.embeddings import OpenAIEmbeddings
8 | from langchain.vectorstores import FAISS
9 | from langchain.vectorstores.base import VectorStoreRetriever
10 |
11 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs
12 |
13 |
14 | class ChatbotRAG(Chatbot):
15 | def __init__(
16 | self,
17 | vector_store: FAISS | None = None,
18 | embeddings_kwargs: t.Dict | None = None,
19 | search_kwargs: t.Dict | None = None,
20 | **model_kwargs: t.Unpack[ModelArgs],
21 | ) -> None:
22 | super().__init__(**model_kwargs)
23 | if vector_store:
24 | self.vector_store = vector_store
25 | self.embeddings_kwargs = embeddings_kwargs or {}
26 | self.search_kwargs = search_kwargs or {}
27 |
28 | @cached_property
29 | def embeddings(self) -> OpenAIEmbeddings:
30 | return OpenAIEmbeddings(**self.embeddings_kwargs)
31 |
32 | @cached_property
33 | def vector_store(self) -> FAISS:
34 | return FAISS.load_local(folder_path="faiss_index", embeddings=self.embeddings)
35 |
36 | @cached_property
37 | def retriever(self) -> VectorStoreRetriever:
38 | return self.vector_store.as_retriever(
39 | search_type="similarity",
40 | search_kwargs=self.search_kwargs,
41 | )
42 |
43 | @cached_property
44 | def chain(self) -> BaseConversationalRetrievalChain:
45 | return ConversationalRetrievalChain.from_llm(
46 | llm=self.llm,
47 | memory=self.memory,
48 | verbose=True,
49 | combine_docs_chain_kwargs={"prompt": self.template},
50 | chain_type="stuff",
51 | retriever=self.retriever,
52 | )
53 |
54 | def ask(
55 | self,
56 | query: str,
57 | language: str | None = None,
58 | ) -> str:
59 | return self.chain.run(
60 | question=query,
61 | language=language or "the input language",
62 | callbacks=self.callbacks,
63 | )
64 |
--------------------------------------------------------------------------------
/pages/large_language_models/chatbot.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from pages.large_language_models import LLM_CONFIG
5 | from src.generative_ai.large_language_models import Chatbot
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | st_ss = st.session_state
11 |
12 |
13 | def main():
14 | utils.show_source_code("src/generative_ai/large_language_models/chatbots/chatbot.py")
15 | with st.expander(label="Chat parameters", expanded=True):
16 | col1, col2 = st.columns(2)
17 | with col1:
18 | selected_language = st_ss.setdefault(
19 | "language_widget", utils.LanguageWidget()
20 | ).selected_language
21 | with col2:
22 | lakera_activated = st_ss.setdefault(
23 | "lakera_widget", utils.LakeraWidget()
24 | ).lakera_activated
25 |
26 | chosen_model = st.selectbox(
27 | label="Large Language Model:",
28 | placeholder="Choose an option",
29 | options=LLM_CONFIG.keys(),
30 | index=0,
31 | on_change=utils.reset_session_state_key,
32 | kwargs={"key": "chatbot"},
33 | )
34 |
35 | provided_context = st.text_area(
36 | label="Context:",
37 | value="",
38 | help="This context will be passed to the chatbot.",
39 | )
40 |
41 | if chosen_model:
42 | chatbot = st_ss.setdefault("chatbot", Chatbot(**LLM_CONFIG[chosen_model]))
43 | for message in chatbot.history:
44 | st.chat_message(message["role"]).write(message["content"])
47 |
48 | if prompt := st.chat_input(
49 | placeholder=f"Chat with {chosen_model}!" if chosen_model else "",
50 | disabled=not chosen_model,
51 | ):
52 | st.chat_message("human").write(prompt)
53 | if lakera_activated:
54 | flag, response = st_ss.setdefault(
55 | "lakera_widget", utils.LakeraWidget()
56 | ).flag_prompt(prompt=prompt)
57 | if flag:
58 | st.warning(body="Prompt injection detected", icon="🚨")
59 | st.expander(label="LOGS").json(response)
60 | with st.chat_message("ai"):
61 | chatbot.ask(
62 | query=prompt,
63 | context=provided_context,
64 | language=selected_language,
65 | )
66 |
--------------------------------------------------------------------------------
/pages/statistical_tests/ab_test.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.statistics.statistical_tests import ABTesting, input_group_data
5 |
6 | loader = utils.PageConfigLoader(__file__)
7 | loader.set_page_config(globals())
8 |
9 |
10 | def main():
11 | st.header("Data", divider="gray")
12 | a_col, b_col = st.columns(2, gap="small")
13 | with a_col.container(border=True):
14 | st.subheader("Group A")
15 | a_visitors, a_conversions, a_rate = input_group_data(
16 | group_name="A", default_visitors=1000, default_conversions=50
17 | )
18 | with b_col.container(border=True):
19 | st.subheader("Group B")
20 | b_visitors, b_conversions, b_rate = input_group_data(
21 | group_name="B", default_visitors=200, default_conversions=35
22 | )
23 |
24 | st.header("Settings", divider="gray")
25 | settings_container = st.container(border=True)
26 | test_type = settings_container.selectbox(
27 | label="Test type",
28 | key="ab_test.test_type",
29 | options=["one-sided", "two-sided"],
30 | index=1,
31 | format_func=lambda x: x.replace("-", " ").capitalize(),
32 | )
33 | confidence_col, alpha_col = settings_container.columns(2)
34 | confidence = confidence_col.columns([0.15, 1, 0.15])[1].select_slider(
35 | "Confidence level",
36 | options=[0.9, 0.95, 0.99],
37 | value=0.95,
38 | key="ab_test.confidence",
39 | format_func=lambda x: f"{100*x}%",
40 | on_change=utils.update_slider_callback,
41 | kwargs={"updated": "ab_test.confidence", "to_update": "ab_test.alpha"},
42 | )
43 | alpha = alpha_col.columns([0.15, 1, 0.15])[1].select_slider(
44 | "Alpha value",
45 | options=[0.01, 0.05, 0.1],
46 | value=0.05,
47 | key="ab_test.alpha",
48 | format_func=lambda x: f"{100*x}%",
49 | on_change=utils.update_slider_callback,
50 | kwargs={"updated": "ab_test.alpha", "to_update": "ab_test.confidence"},
51 | )
52 |
53 | ab_testing = ABTesting(a_visitors, a_rate, b_visitors, b_rate, alpha, test_type)
54 |
55 | st.header("Results", divider="gray")
56 | result = ab_testing.perform_ab_test()
57 |
58 | if result["is_significant"]:
59 | st.success("The difference is significant", icon="✅")
60 | else:
61 | st.error("The difference is not significant", icon="❌")
62 |
63 | st.expander(label="Test details").json(result)
64 |
--------------------------------------------------------------------------------
/pages/classification/xgboost.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from src.machine_learning import XGBoostManager
5 | from src.machine_learning.datasets import Dataset
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | logger = utils.CustomLogger(__file__)
11 |
12 | st_ss = st.session_state
13 |
14 |
15 | def main():
16 | utils.tabs_config()
17 | utils.show_source_code("src/machine_learning/xgboost_manager.py")
18 |
19 | st.header("Dataset", divider="gray")
20 | dataset = Dataset(type="classification")
21 | raw_dataset_dict = Dataset.get_dataset(**dataset.params, split=True)
22 | dataset.set(raw_dataset_dict)
23 |
24 | with st.expander(label="Dataset description"):
25 | st.markdown(dataset.description)
26 |
27 | X_train, X_test = dataset.X
28 | y_train, y_test = dataset.y
29 | label_mapping = dataset.label_mapping
30 |
31 | st.subheader("Visualize data")
32 | train_tab, test_tab = st.tabs(tabs=["Train", "Test"])
33 | with train_tab:
34 | with st.container(border=True):
35 | utils.display_tab_content("train", X_train, y_train, label_mapping)
36 | with test_tab:
37 | with st.container(border=True):
38 | utils.display_tab_content("test", X_test, y_test, label_mapping)
39 |
40 | st.header("Classification", divider="gray")
41 | st.markdown(
42 | "Classification model: `XGBClassifier` from `xgboost` "
43 | "([official documentation](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier))"
44 | )
45 | classification_manager = XGBoostManager(task="classification")
46 |
47 | st.subheader("Hyperparameters")
48 | with st.container(border=True):
49 | classification_manager.set_model(label_mapping=label_mapping)
50 |
51 | st.subheader("Evaluation")
52 | classification_manager.fit(X_train, y_train)
53 | classification_manager.evaluate(
54 | X_test, y_test, target_names=list(label_mapping.values())
55 | )
56 | st.markdown("Classification Report")
57 | st.columns(3)[1].dataframe(
58 | data=classification_manager.classification_report, use_container_width=True
59 | )
60 | st.markdown("Confusion Matrix")
61 | st.columns([0.1, 1, 0.1])[1].pyplot(
62 | fig=classification_manager.confusion_matrix_display(
63 | display_labels=list(label_mapping.values())
64 | )
65 | )
66 | st.subheader("Explainability")
67 | st.markdown("SHAP force plot")
68 | utils.st_shap(plot=classification_manager.shap_force_plot(X_test), height=400)
69 |
--------------------------------------------------------------------------------
/pages/large_language_models/chatbot_tools.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from pages.large_language_models import LLM_CONFIG
5 | from src.generative_ai.large_language_models import ChatbotTools
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 | st_ss = st.session_state
11 |
12 |
13 | def main():
14 | utils.show_source_code(
15 | "src/generative_ai/large_language_models/chatbots/chatbot_tools.py"
16 | )
17 | with st.expander(label="Chat parameters", expanded=True):
18 | col1, col2 = st.columns(2)
19 | with col1:
20 | selected_language = st_ss.setdefault(
21 | "language_widget", utils.LanguageWidget()
22 | ).selected_language
23 | with col2:
24 | lakera_activated = st_ss.setdefault(
25 | "lakera_widget", utils.LakeraWidget()
26 | ).lakera_activated
27 |
28 | chosen_model = st.selectbox(
29 | label="Large Language Model:",
30 | placeholder="Choose an option",
31 | options=LLM_CONFIG.keys(),
32 | index=0,
33 | on_change=utils.reset_session_state_key,
34 | kwargs={"key": "chatbot_tools"},
35 | )
36 |
37 | chosen_tools = st.multiselect(
38 | label="Tools:",
39 | options=ChatbotTools.available_tools,
40 | default=None,
41 | on_change=utils.reset_session_state_key,
42 | kwargs={"key": "chatbot_tools"},
43 | )
44 |
45 | if chosen_model and chosen_tools:
46 | chatbot = st_ss.setdefault(
47 | "chatbot_tools",
48 | ChatbotTools(**LLM_CONFIG[chosen_model], tool_names=chosen_tools),
49 | )
50 | for message in chatbot.history:
51 | st.chat_message(message["role"]).write(message["content"])
52 | else:
53 | st.info("Choose tools for the LLM", icon="ℹ️")
54 |
55 | if prompt := st.chat_input(
56 | placeholder=f"Chat with {chosen_model}!"
57 | if (chosen_model and chosen_tools)
58 | else "",
59 | disabled=not (chosen_model and chosen_tools),
60 | ):
61 | st.chat_message("human").write(prompt)
62 | if lakera_activated:
63 | flag, response = st_ss.setdefault(
64 | "lakera_widget", utils.LakeraWidget()
65 | ).flag_prompt(prompt=prompt)
66 | if flag:
67 | st.warning(body="Prompt injection detected", icon="🚨")
68 | st.expander(label="LOGS").json(response)
69 | with st.chat_message("ai"):
70 | st.write(chatbot.ask(
71 | query=prompt,
72 | language=selected_language,
73 | ))
74 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/chatbots/chatbot_tools.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | from langchain.agents import (AgentExecutor, AgentType, initialize_agent,
5 | load_tools)
6 | from langchain.callbacks.base import BaseCallbackHandler
7 | from langchain.tools import BaseTool
8 |
9 | from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs
10 |
11 |
12 | class ChatbotTools(Chatbot):
13 | available_tools = ["google-search", "arxiv", "wikipedia", "stackexchange", "human"]
14 |
15 | def __init__(
16 | self,
17 | tool_names: t.List[str] | None = None,
18 | **model_kwargs: t.Unpack[ModelArgs],
19 | ) -> None:
20 | super().__init__(**model_kwargs)
21 | self.tool_names = tool_names or []
22 | self.memory.input_key = "input"
23 |
24 | # @property
25 | # def callbacks(self) -> t.List[BaseCallbackHandler]:
26 | # return [super().callbacks[1]]
27 |
28 | @cached_property
29 | def tools(self) -> t.List[BaseTool]:
30 | return load_tools(tool_names=self.tool_names)
31 |
32 | @staticmethod
33 | def update_agent_prompt_template(
34 | agent: AgentExecutor,
35 | text: str,
36 | input_variable: str | None = None,
37 | ):
38 | template = agent.agent.llm_chain.prompt.template
39 | newline_index = agent.agent.llm_chain.prompt.template.find("\n\n")
40 | agent.agent.llm_chain.prompt.template = text + template[newline_index:]
41 | if input_variable:
42 | agent.agent.llm_chain.prompt.input_variables.append(input_variable)
43 | return agent
44 |
45 | @cached_property
46 | def chain(self) -> AgentExecutor:
47 | agent = initialize_agent(
48 | llm=self.llm,
49 | memory=self.memory,
50 | verbose=True,
51 | agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
52 | agent_kwargs={
53 | "input_variables": [
54 | "input",
55 | "chat_history",
56 | "agent_scratchpad",
57 | "language",
58 | ]
59 | },
60 | tools=self.tools,
61 | handle_parsing_errors=True,
62 | return_intermediate_steps=False,
63 | )
64 | agent = self.update_agent_prompt_template(
65 | agent=agent,
66 | text="Assistant is a large language model, speaking in {language}.",
67 | input_variable="language",
68 | )
69 | return agent
70 |
71 | def ask(
72 | self,
73 | query: str,
74 | language: str | None = None,
75 | ) -> str:
76 | return self.chain.run(
77 | input=query,
78 | language=language or "the input language",
79 | callbacks=self.callbacks,
80 | )
81 |
--------------------------------------------------------------------------------
/src/machine_learning/clustering/kmeans_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 | import plotly.express as px
5 | import plotly.graph_objects as go
6 | import streamlit as st
7 | from sklearn.cluster import KMeans
8 |
9 |
10 | class KMeansManager:
11 | def __init__(self, max_n_clusters: int):
12 | self.max_n_clusters = max_n_clusters
13 | self.model: KMeans | None = None
14 |
15 | @property
16 | def params(self) -> t.Dict[str, int]:
17 | columns = st.columns(2)
18 | return {
19 | "n_clusters": columns[0].slider(
20 | label="Number of Clusters",
21 | min_value=1,
22 | max_value=self.max_n_clusters,
23 | value=2,
24 | step=1,
25 | help="Number of clusters to form.",
26 | ),
27 | }
28 |
29 | @staticmethod
30 | @st.cache_resource(show_spinner=True)
31 | def _get_model(n_clusters: int) -> KMeans:
32 | return KMeans(n_clusters=n_clusters, n_init="auto")
33 |
34 | def set_model(self) -> None:
35 | params = self.params
36 | self.model = self._get_model(params["n_clusters"])
37 |
38 | @staticmethod
39 | @st.cache_resource(
40 | show_spinner=True,
41 | hash_funcs={KMeans: lambda model: model.n_clusters},
42 | )
43 |     def _perform_clustering(model: KMeans, data: pd.DataFrame) -> t.Tuple[KMeans, pd.DataFrame]:
44 | model = model.fit(data)
45 | clusters = model.predict(data)
46 | data = data.assign(Cluster=clusters)
47 | data["Cluster"] = data["Cluster"].astype(str)
48 | return model, data
49 |
50 | def fit(self, data: pd.DataFrame):
51 | self.model, self.data_clustered = self._perform_clustering(
52 | model=self.model, data=data
53 | )
54 |
55 |     def scatter_plot(self, x_col: str, y_col: str) -> go.Figure:
56 | return px.scatter(
57 | self.data_clustered,
58 | x=x_col,
59 | y=y_col,
60 | color="Cluster",
61 | labels={"color": "Cluster"},
62 | )
63 |
64 |     def centroids_plot(self, x_col: str, y_col: str) -> go.Figure:
65 | centroids = pd.DataFrame(
66 | self.model.cluster_centers_,
67 | columns=[f"{col}_centroid" for col in self.data_clustered.columns[:-1]],
68 | )
69 | centroids[x_col] = centroids[f"{x_col}_centroid"]
70 | centroids[y_col] = centroids[f"{y_col}_centroid"]
71 |
72 | fig = px.scatter(
73 | self.data_clustered,
74 | x=x_col,
75 | y=y_col,
76 | color="Cluster",
77 | labels={"color": "Cluster"},
78 | )
79 |
80 | fig.add_trace(
81 | go.Scatter(
82 | x=centroids[x_col],
83 | y=centroids[y_col],
84 | mode="markers",
85 | marker=dict(size=20, symbol="x", color="white"),
86 | name="Centroids",
87 | )
88 | )
89 | return fig
90 |
--------------------------------------------------------------------------------
/pages/large_language_models/chatbot_rag.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | import utils
4 | from pages.large_language_models import LLM_CONFIG
5 | from src.generative_ai.large_language_models import (ChatbotRAG,
6 | get_vector_store)
7 |
8 | loader = utils.PageConfigLoader(__file__)
9 | loader.set_page_config(globals())
10 |
11 | st_ss = st.session_state
12 |
13 |
14 | def main():
15 | utils.show_source_code(
16 | path="src/generative_ai/large_language_models/chatbots/chatbot_rag.py"
17 | )
18 | with st.expander(label="Chat parameters", expanded=True):
19 | col1, col2 = st.columns(2)
20 | with col1:
21 | selected_language = st_ss.setdefault(
22 | "language_widget", utils.LanguageWidget()
23 | ).selected_language
24 | with col2:
25 | lakera_activated = st_ss.setdefault(
26 | "lakera_widget", utils.LakeraWidget()
27 | ).lakera_activated
28 |
29 | chosen_model = st.selectbox(
30 | label="Large Language Model:",
31 | placeholder="Choose an option",
32 | options=LLM_CONFIG.keys(),
33 | index=0,
34 | on_change=utils.reset_session_state_key,
35 | kwargs={"key": "chatbot_rag"},
36 | )
37 |
38 | if uploaded_file := st.file_uploader(
39 | "Upload a PDF file",
40 | type="pdf",
41 | accept_multiple_files=False,
42 | help="https://python.langchain.com/docs/use_cases/question_answering/#what-is-rag",
43 | on_change=utils.reset_session_state_key,
44 | kwargs={"key": "chatbot_rag"},
45 | ):
46 | with open(uploaded_file.name, "wb") as f:
47 | f.write(uploaded_file.getbuffer())
48 | vector_db = get_vector_store(file=uploaded_file.name, mode="upload")
49 |
50 | if chosen_model and uploaded_file:
51 | chatbot = st_ss.setdefault(
52 | "chatbot_rag",
53 | ChatbotRAG(vector_store=vector_db, **LLM_CONFIG[chosen_model]),
54 | )
55 | for message in chatbot.history:
56 | st.chat_message(message["role"]).write(message["content"])
57 | else:
58 | st.info("Please upload a PDF file for the RAG", icon="ℹ️")
59 |
60 | if prompt := st.chat_input(
61 | placeholder=f"Chat with {chosen_model}!"
62 | if (chosen_model and uploaded_file)
63 | else "",
64 | disabled=not (chosen_model and uploaded_file),
65 | ):
66 | st.chat_message("human").write(prompt)
67 | if lakera_activated:
68 | flag, response = st_ss.setdefault(
69 | "lakera_widget", utils.LakeraWidget()
70 | ).flag_prompt(prompt=prompt)
71 | if flag:
72 | st.warning(body="Prompt injection detected", icon="🚨")
73 | st.expander(label="LOGS").json(response)
74 | with st.chat_message("ai"):
75 | chatbot.ask(
76 | query=prompt,
77 | language=selected_language,
78 | )
79 |
--------------------------------------------------------------------------------
/src/computer_vision/object_detection/face_detection.py:
--------------------------------------------------------------------------------
1 | import os
2 | import typing as t
3 | from functools import cached_property
4 |
5 | import cv2
6 | import mediapipe as mp
7 | import streamlit_webrtc as st_webrtc
8 | from av import VideoFrame
9 | from mediapipe.framework.formats import detection_pb2
10 | from numpy import ndarray
11 |
12 | import utils
13 |
14 | logger = utils.CustomLogger(__file__)
15 |
16 | os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"
17 |
18 |
19 | class FaceDetectionApp:
20 | def __init__(self):
21 | pass
22 |
23 | @cached_property
24 | def detector(self):
25 | return mp.solutions.face_detection.FaceDetection(
26 | min_detection_confidence=0.5,
27 | model_selection=0,
28 | )
29 |
30 | def detect_faces(self, image: ndarray) -> t.Any:
31 | return self.detector.process(image).detections
32 |
33 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame:
34 | image = frame.to_ndarray(format="bgr24")
35 |
36 | detection_list = self.detect_faces(image)
37 | self.annotate_faces(
38 | image=image,
39 | detection_list=detection_list,
40 | )
41 | utils.annotate_time(image=image)
42 | return VideoFrame.from_ndarray(image, format="bgr24")
43 |
44 | def stream(self) -> None:
45 | st_webrtc.webrtc_streamer(
46 | video_frame_callback=self.video_frame_callback,
47 | key="face_streamer",
48 | mode=st_webrtc.WebRtcMode.SENDRECV,
49 | rtc_configuration=st_webrtc.RTCConfiguration(
50 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"}
51 | ),
52 | media_stream_constraints={"video": True, "audio": False},
53 | async_processing=True,
54 | desired_playing_state=None,
55 | )
56 |
57 | @staticmethod
58 | def annotate_faces(
59 | image: ndarray,
60 | detection_list: t.List[detection_pb2.Detection],
61 | ) -> None:
62 | if not detection_list:
63 | return
64 |
65 | for detection in detection_list:
66 | score = detection.score[0]
67 | bbox = detection.location_data.relative_bounding_box
68 | height, width, _ = image.shape
69 | xmin, ymin = int(bbox.xmin * width), int(bbox.ymin * height)
70 | xmax, ymax = int((bbox.xmin + bbox.width) * width), int(
71 | (bbox.ymin + bbox.height) * height
72 | )
73 | cv2.rectangle(
74 | img=image,
75 | pt1=(xmin, ymin),
76 | pt2=(xmax, ymax),
77 | color=(0, 255, 0),
78 | thickness=3,
79 | )
80 | cv2.putText(
81 | img=image,
82 | text=f"score: {score:.3f}",
83 | org=(xmin, ymin - 10),
84 | fontFace=cv2.FONT_HERSHEY_SIMPLEX,
85 | fontScale=0.5,
86 | color=(0, 255, 0),
87 | thickness=2,
88 | )
89 |
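The corresponding object-detection page is not shown in this excerpt; below is a minimal sketch of how such a page might wire the class up, assuming it only needs to instantiate the app and start the WebRTC stream:

```python
import streamlit as st

from src.computer_vision.object_detection.face_detection import FaceDetectionApp

st.header("Face detection", divider="gray")
# stream() opens the webcam through streamlit-webrtc and annotates each
# incoming frame with bounding boxes via video_frame_callback.
FaceDetectionApp().stream()
```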
--------------------------------------------------------------------------------
/pages/statistical_tests/chi2_test.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import streamlit as st
3 |
4 | import utils
5 | from src.statistics.statistical_tests import Chi2Testing
6 |
7 | loader = utils.PageConfigLoader(__file__)
8 | loader.set_page_config(globals())
9 |
10 |
11 | def main():
12 | st.header("Data", divider="gray")
13 | observed_template = pd.DataFrame(
14 | data=[["Group A", 30, 20], ["Group B", 70, 80]],
15 | index=None,
16 | columns=["group", "category_1", "category_2"],
17 | )
18 | col_df, col_sum = st.columns([0.8, 0.2])
19 | with col_df:
20 | observed = st.data_editor(
21 | data=observed_template,
22 | hide_index=True,
23 | column_config={
24 | "group": st.column_config.TextColumn(
25 | "Group",
26 | help="The name of the considered group.",
27 | ),
28 | "category_1": st.column_config.NumberColumn(
29 | "Category 1",
30 | min_value=1,
31 | required=True,
32 |                     help="The observed values for category 1.",
33 | ),
34 | "category_2": st.column_config.NumberColumn(
35 | "Category 2",
36 | min_value=1,
37 | required=True,
38 |                     help="The observed values for category 2.",
39 | ),
40 | },
41 | disabled=False,
42 | use_container_width=True,
43 | )
44 | st.info("Click on any cell to change its content.", icon="💡")
45 | with col_sum:
46 | total_col = observed.drop("group", axis=1).sum(axis=1).to_frame(name="Total")
47 | st.dataframe(total_col, hide_index=True, use_container_width=True)
48 |
49 | st.header("Settings", divider="gray")
50 | settings_container = st.container(border=True)
51 | confidence_col, alpha_col = settings_container.columns(2)
52 | confidence = confidence_col.columns([0.15, 1, 0.15])[1].select_slider(
53 | "Confidence level",
54 | options=[0.9, 0.95, 0.99],
55 | value=0.95,
56 | key="chi2_test.confidence",
57 | format_func=lambda x: f"{100*x}%",
58 | on_change=utils.update_slider_callback,
59 | kwargs={"updated": "chi2_test.confidence", "to_update": "chi2_test.alpha"},
60 | )
61 | alpha = alpha_col.columns([0.15, 1, 0.15])[1].select_slider(
62 | "Alpha value",
63 | options=[0.01, 0.05, 0.1],
64 | value=0.05,
65 | key="chi2_test.alpha",
66 | format_func=lambda x: f"{100*x}%",
67 | on_change=utils.update_slider_callback,
68 | kwargs={"updated": "chi2_test.alpha", "to_update": "chi2_test.confidence"},
69 | )
70 |
71 | chi2_testing = Chi2Testing(observed.drop("group", axis=1), alpha)
72 |
73 | st.header("Results", divider="gray")
74 | result = chi2_testing.perform_chi2_test()
75 |
76 | if result["is_significant"]:
77 | st.success("The difference is significant", icon="✅")
78 | else:
79 | st.error("The difference is not significant", icon="❌")
80 |
81 | st.expander(label="Test details").json(result)
82 |
--------------------------------------------------------------------------------
/src/statistics/dimensionality_reduction/umap_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 | import plotly.express as px
5 | import streamlit as st
6 | from umap import UMAP
7 |
8 |
9 | class UMAPManager:
10 | def __init__(self, max_n_components: int):
11 | self.max_n_components = max_n_components
12 | self.model: UMAP | None = None
13 | self.target_col: pd.Series | None = None
14 | self.embedded_data_df: pd.DataFrame | None = None
15 |
16 | @property
17 | def params(self) -> t.Dict[str, int | float]:
18 | columns = st.columns(3)
19 | return {
20 | "n_components": columns[0].slider(
21 | label="Number of Components",
22 | min_value=1,
23 | max_value=self.max_n_components,
24 | value=3,
25 | step=1,
26 | help="Number of components to compute.",
27 | ),
28 | "n_neighbors": columns[1].slider(
29 | label="Number of Neighbors",
30 | min_value=2,
31 | max_value=100,
32 | value=15,
33 | step=1,
34 | help="Size of local neighborhood used for manifold approximation.",
35 | ),
36 | "min_dist": columns[2].slider(
37 | label="Minimum Distance",
38 | min_value=0.1,
39 | max_value=1.0,
40 | value=0.5,
41 | step=0.1,
42 | help="Minimum distance between embedded points.",
43 | ),
44 | }
45 |
46 | @st.cache_resource(show_spinner=True)
47 | def _get_model(_self, params: t.Dict[str, int | float]) -> UMAP:
48 | return UMAP(
49 | n_components=params["n_components"],
50 | n_neighbors=params["n_neighbors"],
51 | min_dist=params["min_dist"],
52 | )
53 |
54 | def set_model(self) -> None:
55 | params = self.params
56 | self.model = self._get_model(params)
57 |
58 | @st.cache_resource(
59 | show_spinner=True,
60 | hash_funcs={
61 | UMAP: lambda model: (model.n_components, model.n_neighbors, model.min_dist)
62 | },
63 | )
64 | def _compute_umap(_self, model: UMAP, data: pd.DataFrame) -> pd.DataFrame:
65 | embedded_data = model.fit_transform(data)
66 | column_names = [f"D{i}" for i in range(1, model.n_components + 1)]
67 | return pd.DataFrame(embedded_data, columns=column_names)
68 |
69 | def fit(self, data: pd.DataFrame, target_col: pd.Series):
70 | self.embedded_data_df = self._compute_umap(model=self.model, data=data)
71 | self.target_col = target_col
72 |
73 | def scatter_matrix_plot(self) -> None:
74 | return px.scatter_matrix(
75 | self.embedded_data_df, color=self.target_col, labels={"color": "target"}
76 | )
77 |
78 | def scatter_2d_plot(self) -> None:
79 | return px.scatter(
80 | self.embedded_data_df,
81 | x="D1",
82 | y="D2",
83 | color=self.target_col,
84 | labels={"color": "target"},
85 | )
86 |
87 | def scatter_3d_plot(self) -> None:
88 | return px.scatter_3d(
89 | self.embedded_data_df,
90 | x="D1",
91 | y="D2",
92 | z="D3",
93 | color=self.target_col,
94 | labels={"color": "target"},
95 | )
96 |
--------------------------------------------------------------------------------
/src/generative_ai/large_language_models/chatbots/chatbot.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from functools import cached_property
3 |
4 | from langchain.callbacks import StreamingStdOutCallbackHandler
5 | from langchain.callbacks.base import BaseCallbackHandler
6 | from langchain.chains import LLMChain
7 | from langchain.chains.base import Chain
8 | from langchain.chat_models import ChatOpenAI
9 | from langchain.llms import Together
10 | from langchain.memory import ConversationBufferMemory
11 | from langchain.prompts import PromptTemplate
12 |
13 | from src.generative_ai.large_language_models.callbacks import \
14 | StreamingChatCallbackHandler
15 |
16 |
17 | class ModelArgs(t.TypedDict):
18 | provider: t.Literal["openai", "together"]
19 | owner: t.Literal["mistralai", "togethercomputer"] | None
20 | string: t.Literal["gpt-3.5-turbo", "llama-2-7b-chat", "Mistral-7B-Instruct-v0.1"]
21 |
22 |
23 | class Chatbot:
24 | BASE_TEMPLATE = """
25 | Use the following context and chat history to answer the question:
26 |
27 | Context: {context}
28 | Chat history: {chat_history}
29 | Question: {question}
30 |
31 | Your answer (in {language}):
32 | """
33 |
34 | def __init__(self, **model_kwargs: t.Unpack[ModelArgs]) -> None:
35 | self.model_provider = model_kwargs.get("provider", "openai")
36 | self.model_owner = model_kwargs.get("owner", None)
37 | self.model_string = model_kwargs.get("string", "gpt-3.5-turbo")
38 |
39 | @cached_property
40 | def llm(self) -> ChatOpenAI | Together:
41 | if self.model_provider == "openai":
42 | return ChatOpenAI(
43 | model=self.model_string,
44 | streaming=True,
45 | model_kwargs={},
46 | )
47 | elif self.model_provider == "together":
48 | return Together(
49 | model=f"{self.model_owner}/{self.model_string}",
50 | max_tokens=1024,
51 | )
52 |
53 | @cached_property
54 | def memory(self) -> ConversationBufferMemory:
55 | return ConversationBufferMemory(
56 | memory_key="chat_history",
57 | input_key="question",
58 | return_messages=True,
59 | )
60 |
61 | @property
62 | def history(self) -> t.List[t.Dict[str, str]]:
63 | return [
64 | {"role": message.type, "content": message.content}
65 | for message in self.memory.buffer
66 | ]
67 |
68 | @cached_property
69 | def template(self) -> PromptTemplate:
70 | return PromptTemplate(
71 | template=self.BASE_TEMPLATE,
72 | input_variables=["context", "chat_history", "question", "language"],
73 | )
74 |
75 | @cached_property
76 | def chain(self) -> Chain:
77 | return LLMChain(
78 | llm=self.llm,
79 | memory=self.memory,
80 | verbose=True,
81 | prompt=self.template,
82 | )
83 |
84 | @property
85 | def callbacks(self) -> t.List[BaseCallbackHandler]:
86 | return [StreamingChatCallbackHandler(), StreamingStdOutCallbackHandler()]
87 |
88 | def ask(
89 | self,
90 | query: str,
91 | context: str | None = None,
92 | language: str | None = None,
93 | ) -> str:
94 | return self.chain.run(
95 | question=query,
96 | context=context or "",
97 | language=language or "the input language",
98 | callbacks=self.callbacks,
99 | )
100 |
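The chat pages build instances with `Chatbot(**LLM_CONFIG[chosen_model])`, so each `LLM_CONFIG` entry must match the `ModelArgs` TypedDict above. A hypothetical sketch of such a mapping and of a direct call to `ask` (the display-name keys and entry contents are assumptions, not taken from the repository's actual config):

```python
from src.generative_ai.large_language_models.chatbots import Chatbot, ModelArgs

# Hypothetical entries; values follow the Literals declared in ModelArgs.
LLM_CONFIG: dict[str, ModelArgs] = {
    "gpt-3.5-turbo": {"provider": "openai", "owner": None, "string": "gpt-3.5-turbo"},
    "Mistral-7B-Instruct-v0.1": {
        "provider": "together",
        "owner": "mistralai",
        "string": "Mistral-7B-Instruct-v0.1",
    },
}

chatbot = Chatbot(**LLM_CONFIG["gpt-3.5-turbo"])
answer = chatbot.ask(query="What does this app do?", context="", language="English")
```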
--------------------------------------------------------------------------------
/src/statistics/dimensionality_reduction/tsne_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 | import plotly.express as px
5 | import streamlit as st
6 | from sklearn.manifold import TSNE
7 |
8 |
9 | class TSNEManager:
10 | def __init__(self, max_n_components: int):
11 | self.max_n_components = max_n_components
12 | self.model: TSNE | None = None
13 | self.target_col: pd.Series | None = None
14 | self.embedded_data_df: pd.DataFrame | None = None
15 |
16 | @property
17 | def params(self) -> t.Dict[str, int]:
18 | columns = st.columns(3)
19 | return {
20 | "n_components": columns[0].slider(
21 | label="Number of Components",
22 | min_value=1,
23 | max_value=self.max_n_components,
24 | value=3,
25 | step=1,
26 | help="Number of components to compute.",
27 | ),
28 | "perplexity": columns[1].slider(
29 | label="Perplexity",
30 | min_value=1,
31 | max_value=100,
32 | value=30,
33 | step=1,
34 | help="A measure of how to balance attention between local and global aspects of the data.",
35 | ),
36 | "learning_rate": columns[2].slider(
37 | label="Learning Rate",
38 | min_value=10.0,
39 | max_value=500.0,
40 | value=200.0,
41 | step=50.0,
42 | help="Step size for each iteration in optimizing the cost function.",
43 | ),
44 | }
45 |
46 | @st.cache_resource(show_spinner=True)
47 | def _get_model(_self, params: t.Dict[str, int]) -> TSNE:
48 | return TSNE(
49 | n_components=params["n_components"],
50 | perplexity=params["perplexity"],
51 | learning_rate=params["learning_rate"],
52 | )
53 |
54 | def set_model(self) -> None:
55 | params = self.params
56 | self.model = self._get_model(params)
57 |
58 | @st.cache_resource(
59 | show_spinner=True,
60 | hash_funcs={
61 | TSNE: lambda model: (
62 | model.n_components,
63 | model.perplexity,
64 | model.learning_rate,
65 | )
66 | },
67 | )
68 | def _compute_tsne(_self, model: TSNE, data: pd.DataFrame) -> pd.DataFrame:
69 | embedded_data = model.fit_transform(data)
70 | column_names = [f"D{i}" for i in range(1, model.n_components + 1)]
71 | return pd.DataFrame(embedded_data, columns=column_names)
72 |
73 | def fit(self, data: pd.DataFrame, target_col: pd.Series):
74 | self.embedded_data_df = self._compute_tsne(model=self.model, data=data)
75 | self.target_col = target_col
76 |
77 | def scatter_matrix_plot(self) -> None:
78 | return px.scatter_matrix(
79 | self.embedded_data_df, color=self.target_col, labels={"color": "target"}
80 | )
81 |
82 | def scatter_2d_plot(self) -> None:
83 | return px.scatter(
84 | self.embedded_data_df,
85 | x="D1",
86 | y="D2",
87 | color=self.target_col,
88 | labels={"color": "target"},
89 | )
90 |
91 | def scatter_3d_plot(self) -> None:
92 | return px.scatter_3d(
93 | self.embedded_data_df,
94 | x="D1",
95 | y="D2",
96 | z="D3",
97 | color=self.target_col,
98 | labels={"color": "target"},
99 | )
100 |
--------------------------------------------------------------------------------
/src/computer_vision/landmarks/base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import typing as t
3 | from datetime import datetime
4 |
5 | import cv2
6 | import mediapipe as mp
7 | import streamlit_webrtc as st_webrtc
8 | from av import VideoFrame
9 | from mediapipe.framework.formats import landmark_pb2
10 | from numpy import ndarray
11 |
12 | import utils
13 |
14 | logger = utils.CustomLogger(__file__)
15 |
16 | os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"
17 |
18 |
19 | class BaseLandmarkerApp:
20 | def __init__(self):
21 | pass
22 |
23 | def get_landmarks(self, image: ndarray) -> landmark_pb2.NormalizedLandmarkList:
24 | detection_result = self.landmarker.process(image)
25 | landmark_list = getattr(detection_result, self.landmarks_type)
26 | return landmark_list[0] if isinstance(landmark_list, list) else landmark_list
27 |
28 | def video_frame_callback(self, frame: VideoFrame) -> VideoFrame:
29 | image = frame.to_ndarray(format="bgr24")
30 |
31 | landmark_list = self.get_landmarks(image)
32 | self.annotate_landmarks(
33 | image=image,
34 | connections_list=self.connections_list,
35 | landmark_list=landmark_list,
36 | drawing_specs_list=self.drawing_specs_list,
37 | )
38 | utils.annotate_time(image=image)
39 | return VideoFrame.from_ndarray(image, format="bgr24")
40 |
41 | def stream(self) -> None:
42 | st_webrtc.webrtc_streamer(
43 | video_frame_callback=self.video_frame_callback,
44 | key=f"{self.landmarks_type}_streamer",
45 | mode=st_webrtc.WebRtcMode.SENDRECV,
46 | rtc_configuration=st_webrtc.RTCConfiguration(
47 | {"iceServers": utils.get_ice_servers(), "iceTransportPolicy": "relay"}
48 | ),
49 | media_stream_constraints={"video": True, "audio": False},
50 | async_processing=True,
51 | desired_playing_state=None,
52 | )
53 |
54 | @staticmethod
55 | def annotate_time(image: ndarray) -> None:
56 | text = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
57 | text_args = {
58 | "text": text,
59 | "fontFace": cv2.FONT_HERSHEY_SIMPLEX,
60 | "fontScale": 1,
61 | "thickness": 2,
62 | }
63 | text_size = cv2.getTextSize(**text_args)[0]
64 | rect_width, rect_height = text_size[0] + 20, text_size[1] + 20
65 | cv2.rectangle(
66 | img=image,
67 | pt1=(0, 0),
68 | pt2=(rect_width, rect_height),
69 | color=(255, 255, 255),
70 | thickness=cv2.FILLED,
71 | )
72 | cv2.rectangle(
73 | img=image,
74 | pt1=(0, 0),
75 | pt2=(rect_width, rect_height),
76 | color=(0, 0, 0),
77 | thickness=2,
78 | )
79 | cv2.putText(
80 | img=image,
81 | org=(10, text_size[1] + 10),
82 | color=(0, 0, 0),
83 | lineType=cv2.LINE_AA,
84 | **text_args,
85 | )
86 |
87 | @staticmethod
88 | def annotate_landmarks(
89 | image: ndarray,
90 | connections_list: t.List[t.FrozenSet[t.Tuple[int, int]]],
91 | landmark_list: landmark_pb2.NormalizedLandmarkList,
92 | drawing_specs_list: t.List[t.Dict[str, mp.solutions.drawing_utils.DrawingSpec]],
93 | ) -> None:
94 | if not landmark_list:
95 | return
96 |
97 | for connections, drawing_specs in zip(connections_list, drawing_specs_list):
98 | mp.solutions.drawing_utils.draw_landmarks(
99 | image=image,
100 | landmark_list=landmark_list,
101 | connections=connections,
102 | **drawing_specs,
103 | )
104 |
--------------------------------------------------------------------------------
/src/machine_learning/datasets.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 | import streamlit as st
5 | from sklearn import datasets
6 | from sklearn.model_selection import train_test_split
7 |
8 |
9 | class DatasetParams(t.TypedDict):
10 | source: t.Literal["iris", "digits", "breast_cancer"]
11 | test_size: float | None
12 | shuffle: bool
13 | stratify: bool
14 |
15 |
16 | class Dataset:
17 | def __init__(
18 | self,
19 | type: t.Literal["classification", "regression"] | None = None,
20 | ):
21 | self.type = type
22 | self.X: t.Tuple[pd.DataFrame, pd.DataFrame] | None = None
23 | self.y: t.Tuple[pd.Series, pd.Series] | None = None
24 | self.label_mapping: t.Dict[int, str] | None = None
25 | self.description: str | None = None
26 |
27 | @property
28 | def params(self) -> t.Dict[str, t.Any]:
29 | columns = st.columns(3)
30 | return {
31 | "source": columns[0].selectbox(
32 | label="source",
33 | options=["iris", "digits", "breast_cancer"]
34 | if self.type == "classification"
35 | else ["diabetes"]
36 | if self.type == "regression"
37 | else ["iris", "digits", "breast_cancer", "diabetes"],
38 | help="The scikit-learn toy dataset to use.",
39 | ),
40 | "test_size": columns[1].slider(
41 | "test_size",
42 | min_value=0.05,
43 | max_value=0.3,
44 | value=0.2,
45 | step=0.05,
46 | help="The proportion of the dataset to include in the test split",
47 | )
48 | if self.type is not None
49 | else None,
50 | "shuffle": columns[2].checkbox(
51 | label="shuffle",
52 | value=True,
53 | help="Whether to shuffle the dataset or not.",
54 | )
55 | if self.type is not None
56 | else None,
57 | "stratify": columns[2].checkbox(
58 | label="stratify",
59 | value=False,
60 | help="Whether to stratify the dataset or not. "
61 | "Stratifying means keeping the same label distribution in the initial, train and test datasets. "
62 | "Available for classification only.",
63 | disabled=self.type == "regression",
64 | )
65 | if self.type is not None
66 | else None,
67 | }
68 |
69 | @staticmethod
70 | @st.cache_data(show_spinner=False)
71 | def get_dataset(
72 | split: bool = False, **params: t.Unpack[DatasetParams]
73 | ) -> t.Dict[str, t.Any]:
74 | raw_dataset = getattr(datasets, f"load_{params['source']}")(as_frame=True)
75 | X, y = raw_dataset.data, raw_dataset.target
76 | if split:
77 | X_train, X_test, y_train, y_test = train_test_split(
78 | X,
79 | y,
80 | test_size=params["test_size"],
81 | shuffle=params["shuffle"],
82 | stratify=y if params["stratify"] else None,
83 | random_state=0,
84 | )
85 | X = X_train, X_test
86 | y = y_train, y_test
87 | return {
88 | "X": X,
89 | "y": y,
90 | "label_mapping": dict(enumerate(raw_dataset.target_names))
91 | if "target_names" in raw_dataset
92 | else None,
93 | "description": raw_dataset.DESCR,
94 | }
95 |
96 | def set(self, raw_dataset_dict: t.Dict[str, t.Any]):
97 | self.X = raw_dataset_dict["X"]
98 | self.y = raw_dataset_dict["y"]
99 | self.label_mapping = raw_dataset_dict["label_mapping"]
100 | self.description = raw_dataset_dict["description"]
101 |
--------------------------------------------------------------------------------
/src/statistics/statistical_tests/ab_test.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import streamlit as st
4 | from scipy import stats
5 |
6 |
7 | def input_group_data(
8 | group_name: t.Literal["A", "B"],
9 | default_visitors: int,
10 | default_conversions: int,
11 | ) -> t.Tuple[int, int, float]:
12 | visitors = st.number_input(
13 | "Visitors",
14 | key=f"ab_test.{group_name.lower()}_visitors",
15 | min_value=1,
16 | value=default_visitors,
17 | step=1,
18 | )
19 | conversion_col, rate_col = st.columns(2)
20 | conversions = conversion_col.number_input(
21 | "Conversions",
22 | key=f"ab_test.{group_name.lower()}_conversions",
23 | min_value=0,
24 | max_value=visitors,
25 | value=default_conversions,
26 | step=1,
27 | )
28 | rate = rate_col.number_input(
29 | "Conversion rate",
30 | key=f"ab_test.{group_name.lower()}_rate",
31 | min_value=0.0,
32 | max_value=1.0,
33 | value=conversions / visitors,
34 | disabled=True,
35 | )
36 | return visitors, conversions, rate
37 |
38 |
39 | class ABTesting:
40 | def __init__(
41 | self,
42 | a_visitors: int,
43 | a_rate: float,
44 | b_visitors: int,
45 | b_rate: float,
46 | alpha: float,
47 | test_type: t.Literal["one-sided", "two-sided"],
48 | ):
49 | self.a_visitors, self.a_rate = a_visitors, a_rate
50 | self.b_visitors, self.b_rate = b_visitors, b_rate
51 | self.alpha = alpha
52 | self.test_type = test_type
53 |
54 | @staticmethod
55 | @st.cache_data(show_spinner=False)
56 | def compute_standard_deviation(rate: float, visitors: int) -> float:
57 | return (rate * (1 - rate) / visitors) ** 0.5
58 |
59 | @classmethod
60 | @st.cache_data(show_spinner=False)
61 | def compute_confidence_interval(
62 | _cls,
63 | a_rate: float,
64 | b_rate: float,
65 | a_visitors: int,
66 | b_visitors: int,
67 | alpha: float,
68 | ) -> t.Tuple[float, float]:
69 | a_std = _cls.compute_standard_deviation(a_rate, a_visitors)
70 | b_std = _cls.compute_standard_deviation(b_rate, b_visitors)
71 |         # a_std and b_std are already standard errors, i.e. sqrt(rate * (1 - rate) / visitors),
72 |         # so the margin of error is z * sqrt(a_std**2 + b_std**2)
73 |         interval = stats.norm.ppf(1 - alpha / 2) * (a_std**2 + b_std**2) ** 0.5
74 |
75 | return b_rate - a_rate - interval, b_rate - a_rate + interval
76 |
77 | @staticmethod
78 | @st.cache_data(show_spinner=False)
79 | def is_statistically_significant(p_value: float, alpha: float) -> bool:
80 | return p_value < alpha
81 |
82 | @staticmethod
83 | @st.cache_data(show_spinner=False)
84 | def t_test(a_rate, a_std, a_visitors, b_rate, b_std, b_visitors, test_type):
85 | t_statistic, p_value = stats.ttest_ind_from_stats(
86 | mean1=a_rate,
87 | std1=a_std,
88 | nobs1=a_visitors,
89 | mean2=b_rate,
90 | std2=b_std,
91 | nobs2=b_visitors,
92 | )
93 | if test_type == "one-sided":
94 | p_value /= 2
95 |
96 | return t_statistic, p_value
97 |
98 |     def perform_ab_test(self) -> t.Dict[str, t.Any]:
99 | a_std = self.compute_standard_deviation(self.a_rate, self.a_visitors)
100 | b_std = self.compute_standard_deviation(self.b_rate, self.b_visitors)
101 |
102 | t_statistic, p_value = self.t_test(
103 | self.a_rate,
104 | a_std,
105 | self.a_visitors,
106 | self.b_rate,
107 | b_std,
108 | self.b_visitors,
109 | self.test_type,
110 | )
111 |
112 | confidence_interval = self.compute_confidence_interval(
113 | self.a_rate,
114 | self.b_rate,
115 | self.a_visitors,
116 | self.b_visitors,
117 | self.alpha,
118 | )
119 |
120 | is_significant = self.is_statistically_significant(p_value, self.alpha)
121 |
122 | return {
123 | "t_statistic": t_statistic,
124 | "p_value": p_value,
125 | "confidence_interval": confidence_interval,
126 | "is_significant": is_significant,
127 | }
128 |
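A standalone sketch of `ABTesting` used outside the Streamlit page, reusing the page's default figures (1000 visitors / 50 conversions for group A, 200 / 35 for group B), and assuming the `st.cache_data` decorators tolerate running without a full Streamlit session:

```python
from src.statistics.statistical_tests import ABTesting

# Rates derived from the page defaults: 50 / 1000 = 0.05 and 35 / 200 = 0.175.
ab = ABTesting(
    a_visitors=1000,
    a_rate=0.05,
    b_visitors=200,
    b_rate=0.175,
    alpha=0.05,
    test_type="two-sided",
)
result = ab.perform_ab_test()
print(result["p_value"], result["confidence_interval"], result["is_significant"])
```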
--------------------------------------------------------------------------------
/src/statistics/dimensionality_reduction/pca_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import plotly.express as px
6 | import plotly.graph_objects as go
7 | import streamlit as st
8 | from sklearn.decomposition import PCA
9 |
10 |
11 | class PCAManager:
12 | def __init__(self, max_n_components: int):
13 | self.max_n_components = max_n_components
14 | self.normalize: bool | None = None
15 | self.model: PCA | None = None
16 | self.target_col: pd.Series | None = None
17 |
18 | @property
19 | def params(self) -> t.Dict[str, int]:
20 | columns = st.columns(2)
21 | return {
22 | "n_components": columns[0].slider(
23 | label="Number of Components",
24 | min_value=1,
25 | max_value=self.max_n_components,
26 | value=3,
27 | step=1,
28 | help="Number of principal components to compute.",
29 | ),
30 | "normalize": (
31 | columns[1]
32 | .columns([0.5, 1, 0.5])[1]
33 | .toggle("Normalize data", value=False)
34 | ),
35 | }
36 |
37 | @st.cache_resource(show_spinner=True)
38 | def _get_model(_self, n_components: int) -> PCA:
39 | return PCA(n_components)
40 |
41 | def set_model(self) -> None:
42 | params = self.params
43 | self.model = self._get_model(params["n_components"])
44 | self.model.normalize = params["normalize"]
45 |
46 | @st.cache_resource(
47 | show_spinner=True,
48 | hash_funcs={PCA: lambda model: (model.n_components, model.normalize)},
49 | )
50 | def _compute_pca(
51 | _self, model: PCA, data: pd.DataFrame
52 |     ) -> pd.DataFrame:
53 | data_normalized = (
54 | (data - data.mean()) / (data.std() + 1e-5) if model.normalize else data
55 | )
56 | components = model.fit_transform(data_normalized)
57 |
58 | return pd.DataFrame(
59 | components, columns=[f"PC{i+1}" for i in range(components.shape[1])]
60 | )
61 |
62 | def fit(self, data: pd.DataFrame, target_col: pd.Series):
63 | self.components_df = self._compute_pca(model=self.model, data=data)
64 | self.target_col = target_col
65 |
66 |     def scatter_matrix_plot(self) -> go.Figure:
67 | return px.scatter_matrix(
68 | self.components_df, color=self.target_col, labels={"color": "target"}
69 | )
70 |
71 |     def explained_variance_plot(self) -> go.Figure:
72 | exp_var_cumul = np.cumsum(self.model.explained_variance_ratio_)
73 | x_ticks = list(range(1, exp_var_cumul.shape[0] + 1))
74 | fig = px.bar(
75 | x=x_ticks,
76 | y=exp_var_cumul,
77 | labels={"x": "# Components", "y": "Explained Variance"},
78 | )
79 | fig.update_xaxes(tickvals=x_ticks, ticktext=list(map(str, x_ticks)))
80 | fig.add_trace(
81 | go.Scatter(
82 | x=x_ticks,
83 | y=exp_var_cumul,
84 | mode="lines+markers",
85 | line=dict(color="red", width=3),
86 | marker=dict(size=10),
87 | showlegend=False,
88 | )
89 | )
90 | return fig
91 |
92 |     def scatter_2d_plot(self) -> go.Figure:
93 | return px.scatter(
94 | self.components_df,
95 | x="PC1",
96 | y="PC2",
97 | color=self.target_col,
98 | labels={"color": "target"},
99 | )
100 |
101 |     def scatter_3d_plot(self) -> go.Figure:
102 | return px.scatter_3d(
103 | self.components_df,
104 | x="PC1",
105 | y="PC2",
106 | z="PC3",
107 | color=self.target_col,
108 | labels={"color": "target"},
109 | )
110 |
111 |     def loadings_plot(self) -> go.Figure:
112 | loadings = self.model.components_.T * np.sqrt(self.model.explained_variance_)
113 |
114 | fig = px.scatter(
115 | self.components_df,
116 | x="PC1",
117 | y="PC2",
118 | color=self.target_col,
119 | labels={"color": "target"},
120 | )
121 |
122 |         for i, feature in enumerate(self.model.feature_names_in_):  # original feature names recorded by sklearn when fitted on a DataFrame
123 | fig.add_annotation(
124 | ax=0,
125 | ay=0,
126 | axref="x",
127 | ayref="y",
128 | x=loadings[i, 0],
129 | y=loadings[i, 1],
130 | showarrow=True,
131 | arrowsize=2,
132 | arrowhead=2,
133 | xanchor="right",
134 | yanchor="top",
135 | )
136 | fig.add_annotation(
137 | x=loadings[i, 0],
138 | y=loadings[i, 1],
139 | ax=0,
140 | ay=0,
141 | xanchor="center",
142 | yanchor="bottom",
143 | text=feature,
144 | yshift=5,
145 | )
146 | return fig
147 |
--------------------------------------------------------------------------------
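For reference, here is a minimal sketch of the plain scikit-learn calls that `PCAManager` wraps. The iris dataset and the standalone normalization step are assumptions made purely for illustration, not something the repository prescribes.

# Minimal sketch, assuming the iris dataset only for illustration.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

data: pd.DataFrame = load_iris(as_frame=True).data

pca = PCA(n_components=3)
normalized = (data - data.mean()) / (data.std() + 1e-5)  # same normalization as _compute_pca
components = pca.fit_transform(normalized)
components_df = pd.DataFrame(components, columns=[f"PC{i + 1}" for i in range(components.shape[1])])

print(np.cumsum(pca.explained_variance_ratio_))                  # cumulative explained variance (explained_variance_plot)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)  # feature loadings (loadings_plot)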
/pages/pages_config.yaml:
--------------------------------------------------------------------------------
1 | __init__.py:
2 | NAME: Home
3 | ICON: 🏠
4 | DESCRIPTION: |
5 | Welcome to my superapp, a comprehensive toolset for data science and machine learning 🚀
6 | [](https://github.com/daltunay/my-superapp/)
7 |
8 | Technologies used:
9 | - **Programming language**: Python
10 | - **Libraries**: pandas, numpy, scikit-learn, OpenCV, Mediapipe, plotly, XGBoost, SHAP, LangChain, OpenAI, Together, FAISS, ultralytics, umap
11 | - **Deployment**: Docker, Streamlit
12 |
13 | Feel free to provide feedback and make this superapp even more powerful!
14 |
15 |
16 | > _Made by Daniel Altunay_
17 | [](https://linkedin.com/in/daltunay)
18 | [](https://github.com/daltunay)
19 |
20 | ---
21 | SIDEBAR: radio
22 | TAG:
23 |
24 | statistical_tests:
25 | __init__.py:
26 | NAME: Statistical Tests
27 | ICON: 🔢
28 | DESCRIPTION: Perform several statistical tests!
29 | SIDEBAR: radio
30 | TAG: 📚 Statistics
31 |
32 | ab_test.py:
33 | NAME: A/B Test
34 | ICON: 🆎
35 | DESCRIPTION: |
36 | Perform A/B tests!
37 | > pandas, numpy, scipy
38 | TAG:
39 |
40 | chi2_test.py:
41 | NAME: Chi-squared Test
42 | ICON: 🆇
43 | DESCRIPTION: |
44 | Perform chi-squared tests!
45 | > pandas, numpy, scipy
46 | TAG:
47 |
48 | dimensionality_reduction:
49 | __init__.py:
50 | NAME: Dimensionality Reduction
51 | ICON: 🔽
52 |     DESCRIPTION: Reduce the dimensionality of high-dimensional data!
53 | SIDEBAR: radio
54 | TAG: 📚 Statistics
55 |
56 | pca.py:
57 | NAME: PCA
58 | ICON: ⭕
59 | DESCRIPTION: |
60 | Perform Principal Component Analysis!
61 | > pandas, scikit-learn, plotly
62 | TAG:
63 |
64 | t-sne.py:
65 | NAME: t-SNE
66 | ICON: 📊
67 | DESCRIPTION: |
68 | Perform t-distributed Stochastic Neighbor Embedding!
69 | > pandas, scikit-learn, plotly
70 | TAG:
71 |
72 | umap.py:
73 | NAME: UMAP
74 | ICON: 🗺️
75 | DESCRIPTION: |
76 | Perform Uniform Manifold Approximation and Projection!
77 | > pandas, scikit-learn, umap, plotly
78 | TAG:
79 |
80 | landmarks:
81 | __init__.py:
82 | NAME: Landmarks Detection
83 | ICON: 📍
84 | DESCRIPTION: Perform live landmark detection using your webcam!
85 | SIDEBAR: radio
86 | TAG: 👁️ Computer Vision
87 |
88 | face_landmarks.py:
89 | NAME: Face Mesh
90 | ICON: 👤
91 | DESCRIPTION: |
92 | Detect face landmarks using Mediapipe!
93 | > OpenCV, Mediapipe, WebRTC
94 | TAG:
95 |
96 | pose_landmarks.py:
97 | NAME: Pose Landmarks
98 | ICON: 🤸♂️
99 | DESCRIPTION: |
100 | Detect body pose landmarks using Mediapipe!
101 | > OpenCV, Mediapipe, WebRTC
102 | TAG:
103 |
104 | object_detection:
105 | __init__.py:
106 | NAME: Object Detection
107 | ICON: 🔍
108 | DESCRIPTION: Perform live object detection using your webcam!
109 | SIDEBAR: radio
110 | TAG: 👁️ Computer Vision
111 |
112 | face_detection.py:
113 | NAME: Face Detection
114 | ICON: 👀
115 | DESCRIPTION: |
116 | Detect one or several faces using Mediapipe!
117 | > OpenCV, Mediapipe, WebRTC
118 | TAG:
119 |
120 | multi_objects.py:
121 | NAME: Multi-Object Detection
122 | ICON: 📦
123 | DESCRIPTION: |
124 | Detect 80 unique labels using YOLOv8!
125 | > OpenCV, ultralytics, WebRTC
126 | TAG:
127 |
128 | image_generation:
129 | __init__.py:
130 | NAME: Image Generation
131 | ICON: 🎨
132 | DESCRIPTION: Generate pictures with AI!
133 | SIDEBAR: radio
134 | TAG: 🧠 Generative AI
135 |
136 | dall_e.py:
137 | NAME: DALL·E
138 | ICON: 🖼️
139 | DESCRIPTION: DALL·E model from OpenAI
140 | TAG:
141 |
142 | stable_diffusion.py:
143 | NAME: Stable Diffusion
144 | ICON: 🖼️
145 | DESCRIPTION: Stable Diffusion model from Stability AI
146 | TAG:
147 |
148 | large_language_models:
149 | __init__.py:
150 | NAME: Large Language Models
151 | ICON: 💬
152 | DESCRIPTION: Interact with large language models!
153 | SIDEBAR: radio
154 | TAG: 🧠 Generative AI
155 |
156 | chatbot.py:
157 | NAME: Basic Chatbot
158 | ICON: 👋
159 | DESCRIPTION: |
160 | A regular chatbot.
161 | > LangChain, OpenAI, Together
162 | TAG: 🤖 Chatbots
163 |
164 | chatbot_rag.py:
165 | NAME: Chatbot with RAG
166 | ICON: 📄
167 | DESCRIPTION: |
168 | A chatbot with RAG (retrieval augmented generation).
169 | > LangChain, OpenAI, Together, FAISS
170 | TAG: 🤖 Chatbots
171 |
172 | chatbot_tools.py:
173 | NAME: Chatbot with Tools
174 | ICON: 🛠️
175 | DESCRIPTION: |
176 | A chatbot augmented with tools (web access, code interpreter, etc.)
177 | > LangChain (Agents), OpenAI, Together
178 | TAG: 🤖 Chatbots
179 |
180 | chatbot_web_summary.py:
181 |       NAME: Webpage Summary
182 | ICON: 🌐
183 | DESCRIPTION: |
184 |       A model to summarize the text content of a webpage.
185 | > LangChain, OpenAI, Together, unstructured
186 | TAG: 🔄 Other
187 |
188 | classification:
189 | __init__.py:
190 | NAME: Classification
191 | ICON: 🎯
192 | DESCRIPTION: Perform several types of classification!
193 | SIDEBAR: radio
194 | TAG: ⚙️ Machine Learning
195 |
196 | xgboost.py:
197 | NAME: Gradient Boosting
198 | ICON: 🌲
199 | DESCRIPTION: |
200 | Use gradient boosting for binary & multi-class classification!
201 | > pandas, XGBoost, scikit-learn, SHAP, plotly
202 | TAG:
203 |
204 | regression:
205 | __init__.py:
206 | NAME: Regression
207 | ICON: 📈
208 | DESCRIPTION: Perform several types of regression!
209 | SIDEBAR: radio
210 | TAG: ⚙️ Machine Learning
211 |
212 | xgboost.py:
213 | NAME: Gradient Boosting
214 | ICON: 🌲
215 | DESCRIPTION: |
216 | Use gradient boosting for regression!
217 | > pandas, XGBoost, scikit-learn, SHAP, plotly
218 | TAG:
219 |
220 | clustering:
221 | __init__.py:
222 | NAME: Clustering
223 | ICON: 🕸️
224 | DESCRIPTION: Perform different types of clustering!
225 | SIDEBAR: radio
226 | TAG: ⚙️ Machine Learning
227 |
228 | kmeans.py:
229 | NAME: K-Means
230 | ICON: 🇰
231 | DESCRIPTION: |
232 | Perform a K-Means clustering!
233 | > pandas, scikit-learn, plotly
234 | TAG:
235 |
236 | dbscan.py:
237 | NAME: DBSCAN
238 | ICON: 🇩
239 | DESCRIPTION: |
240 | Perform a DBSCAN clustering!
241 | > pandas, scikit-learn, plotly
242 | TAG:
243 |
--------------------------------------------------------------------------------
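The file above is pure data; presumably `utils/pages_config.py` turns it into the sidebar navigation. Below is a minimal sketch of reading it with PyYAML, where the access path follows the structure shown but the loading code itself is an assumption rather than the repository's implementation.

import yaml

# Hypothetical loader sketch; the real logic likely lives in utils/pages_config.py.
with open("pages/pages_config.yaml") as f:
    pages_config = yaml.safe_load(f)

home = pages_config["__init__.py"]                          # NAME: Home, ICON: 🏠, ...
ab_test = pages_config["statistical_tests"]["ab_test.py"]   # nested per page group
print(ab_test["NAME"], ab_test["ICON"])                     # A/B Test 🆎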
/src/machine_learning/xgboost_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import shap
7 | import sklearn.metrics
8 | import streamlit as st
9 | from matplotlib.figure import Figure
10 | from xgboost import XGBClassifier, XGBRegressor
11 |
12 |
13 | def xgb_hash_func(model: XGBClassifier | XGBRegressor):
14 | return {key: val for key, val in vars(model).items() if key != "_Booster"}
15 |
16 |
17 | class XGBoostManager:
18 | def __init__(self, task: t.Literal["classification", "regression"]) -> None:
19 | self.task = task
20 |         self.model: XGBClassifier | XGBRegressor | None = None
21 | self.classification_report: pd.DataFrame | None = None
22 | self.confusion_matrix: pd.DataFrame | None = None
23 | self.metrics_report: pd.DataFrame | None = None
24 |
25 | @property
26 | def params(self) -> t.Dict[str, float | int]:
27 | columns = st.columns(3)
28 | return {
29 | "max_depth": columns[0].slider(
30 | label="`max_depth`",
31 | min_value=1,
32 | max_value=5,
33 | value=3,
34 | step=1,
35 | help="Maximum depth of a tree. "
36 | "Increasing this value will make the model more complex and more likely to overfit. "
37 | "0 indicates no limit on depth.",
38 | ),
39 | "learning_rate": columns[0].select_slider(
40 | label="`learning_rate`",
41 | options=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
42 | value=0.01,
43 | help="Step size shrinkage used in update to prevents overfitting. "
44 | "After each boosting step, we can directly get the weights of new features, and `learning_rate` shrinks the feature weights to make the boosting process more conservative.",
45 | ),
46 | "n_estimators": columns[0].slider(
47 | label="`n_estimators`",
48 | min_value=10,
49 | max_value=50,
50 | value=50,
51 | step=10,
52 | help="Number of gradient boosted trees. "
53 | "Equivalent to number of boosting rounds.",
54 | ),
55 | "subsample": columns[1].slider(
56 | label="`subsample`",
57 | min_value=0.1,
58 | max_value=1.0,
59 | value=0.8,
60 | step=0.1,
61 | help="Subsample ratio of the training instances. "
62 | "Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees and this will prevent overfitting. "
63 | "Subsampling will occur once in every boosting iteration.",
64 | ),
65 | "colsample_bytree": columns[1].slider(
66 | label="`colsample_bytree`",
67 | min_value=0.1,
68 | max_value=1.0,
69 | value=0.8,
70 | step=0.1,
71 | help="Subsample ratio of columns when constructing each tree. "
72 | "Subsampling occurs once for every tree constructed.",
73 | ),
74 | "min_split_loss": columns[1].slider(
75 | label="`min_split_loss`",
76 | min_value=0.0,
77 | max_value=5.0,
78 | value=0.0,
79 | step=0.5,
80 | help="Minimum loss reduction required to make a further partition on a leaf node of the tree. "
81 | "The larger `min_split_loss` is, the more conservative the algorithm will be.",
82 | ),
83 | "min_child_weight": columns[2].slider(
84 | label="`min_child_weight`",
85 | min_value=0.0,
86 | max_value=5.0,
87 | value=1.0,
88 | step=0.5,
89 | help="Minimum sum of instance weight (hessian) needed in a child. "
90 | "If the tree partition step results in a leaf node with the sum of instance weight less than `min_child_weight`, then the building process will give up further partitioning. "
91 | "In linear regression task, this simply corresponds to minimum number of instances needed to be in each node. "
92 | "The larger `min_child_weight` is, the more conservative the algorithm will be.",
93 | ),
94 | "reg_alpha": columns[2].slider(
95 | label="`reg_alpha`",
96 | min_value=0.0,
97 | max_value=5.0,
98 | value=1.0,
99 | step=0.5,
100 | help="L1 regularization term on weights. "
101 | "Increasing this value will make model more conservative.",
102 | ),
103 | "reg_lambda": columns[2].slider(
104 | label="`reg_lambda`",
105 | min_value=0.0,
106 | max_value=5.0,
107 | value=0.0,
108 | step=0.5,
109 | help="L2 regularization term on weights. "
110 | "Increasing this value will make model more conservative.",
111 | ),
112 | }
113 |
114 | @staticmethod
115 | @st.cache_resource(show_spinner=True)
116 | def _get_model(
117 | task: t.Literal["classification", "regression"],
118 | label_mapping: t.Dict[int, str] | None = None,
119 | **params
120 |     ) -> XGBClassifier | XGBRegressor:
121 | if task == "classification":
122 | return XGBClassifier(**params)
123 | elif task == "regression":
124 | return XGBRegressor(**params)
125 |
126 | def set_model(self, label_mapping: t.Dict[int, str] | None = None) -> None:
127 | self.model = self._get_model(self.task, label_mapping, **self.params)
128 |
129 | @staticmethod
130 | @st.cache_resource(
131 | show_spinner=True,
132 | hash_funcs={XGBClassifier: xgb_hash_func, XGBRegressor: xgb_hash_func},
133 | )
134 | def _fit_model(
135 | model: XGBClassifier | XGBRegressor, X_train: pd.DataFrame, y_train: pd.Series
136 | ) -> XGBClassifier | XGBRegressor:
137 | return model.fit(X_train, y_train)
138 |
139 | def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
140 | self.model = self._fit_model(self.model, X_train, y_train)
141 |
142 | @staticmethod
143 | @st.cache_data(show_spinner=True)
144 | def _classification_report(
145 | y_true: pd.Series, y_pred: pd.Series, target_names: t.List[str]
146 | ):
147 | return (
148 | pd.DataFrame(
149 | sklearn.metrics.classification_report(
150 | y_true=y_true,
151 | y_pred=y_pred,
152 | target_names=target_names,
153 | output_dict=True,
154 | zero_division=np.nan,
155 | )
156 | )
157 | .astype(float)
158 | .round(4)
159 | .transpose()
160 | )
161 |
162 | @staticmethod
163 | @st.cache_data(show_spinner=True)
164 | def _confusion_matrix(y_true: pd.Series, y_pred: pd.Series):
165 | return pd.DataFrame(
166 | sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=y_pred)
167 | )
168 |
169 | @staticmethod
170 | @st.cache_data(show_spinner=True)
171 | def _metrics_report(y_true: pd.Series, y_pred: pd.Series):
172 | mean_absolute_error = sklearn.metrics.mean_absolute_error(y_true, y_pred)
173 | median_absolute_error = sklearn.metrics.median_absolute_error(y_true, y_pred)
174 | mean_squared_error = sklearn.metrics.mean_squared_error(y_true, y_pred)
175 | r2 = sklearn.metrics.r2_score(y_true, y_pred)
176 | explained_variance = sklearn.metrics.explained_variance_score(y_true, y_pred)
177 | return pd.DataFrame(
178 | {
179 | "Mean Absolute Error": [mean_absolute_error],
180 | "Median Absolute Error": [median_absolute_error],
181 | "Mean Squared Error": [mean_squared_error],
182 | "Root Mean Squared Error": [mean_squared_error**0.5],
183 | "R^2": [r2],
184 | "Explained Variance": [explained_variance],
185 | },
186 | index=["Value"],
187 | ).transpose()
188 |
189 | @staticmethod
190 | @st.cache_data(show_spinner=True)
191 | def _confusion_matrix_display(
192 |         confusion_matrix: np.ndarray, display_labels: t.List[str]
193 | ) -> sklearn.metrics.ConfusionMatrixDisplay:
194 | return sklearn.metrics.ConfusionMatrixDisplay(
195 | confusion_matrix=confusion_matrix,
196 | display_labels=display_labels,
197 | )
198 |
199 | def confusion_matrix_display(self, display_labels: t.List[str]) -> Figure:
200 | confusion_matrix_display = self._confusion_matrix_display(
201 | confusion_matrix=self.confusion_matrix.to_numpy(),
202 | display_labels=display_labels,
203 | )
204 | fig, ax = plt.subplots()
205 | confusion_matrix_display.plot(ax=ax)
206 | return fig
207 |
208 | def evaluate(
209 | self,
210 | X_test: pd.DataFrame,
211 | y_test: pd.Series,
212 | target_names: t.List[str] | None = None,
213 | ):
214 | y_pred = self.model.predict(X_test)
215 | if self.task == "classification":
216 | self.classification_report = self._classification_report(
217 | y_true=y_test,
218 | y_pred=y_pred,
219 | target_names=target_names,
220 | )
221 | self.confusion_matrix = self._confusion_matrix(
222 | y_true=y_test,
223 | y_pred=y_pred,
224 | )
225 | elif self.task == "regression":
226 | self.metrics_report = self._metrics_report(
227 | y_true=y_test,
228 | y_pred=y_pred,
229 | )
230 |
231 | @staticmethod
232 | @st.cache_data(
233 | show_spinner=True,
234 | hash_funcs={XGBClassifier: xgb_hash_func, XGBRegressor: xgb_hash_func},
235 | )
236 | def _shap_values(model: XGBClassifier | XGBRegressor, X_test: pd.DataFrame):
237 | explainer = shap.TreeExplainer(model)
238 | shap_values = explainer.shap_values(X_test)
239 | return explainer, shap_values
240 |
241 | def shap_force_plot(self, X_test: pd.DataFrame):
242 | explainer, shap_values = self._shap_values(self.model, X_test)
243 | base_value = explainer.expected_value
244 | if isinstance(self.model, XGBClassifier):
245 | base_value = base_value[0]
246 | shap_values = shap_values[0]
247 | return shap.force_plot(
248 | base_value=base_value, shap_values=shap_values, features=X_test
249 | )
250 |
--------------------------------------------------------------------------------
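Stripped of the Streamlit widgets and caching, the classification path of `XGBoostManager` reduces to a handful of library calls. Here is a rough sketch with an assumed toy dataset (breast cancer) and an arbitrary subset of the widget-exposed hyperparameters, not the app's actual configuration.

# Hypothetical end-to-end sketch of what XGBoostManager does for task="classification".
import pandas as pd
import shap
import sklearn.metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = XGBClassifier(max_depth=3, learning_rate=0.01, n_estimators=50)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Essentially the report that _classification_report builds, minus the Streamlit caching.
report = pd.DataFrame(
    sklearn.metrics.classification_report(y_test, y_pred, output_dict=True)
).transpose()
print(report)

# SHAP values, mirroring _shap_values; shapes differ for multi-class models.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
mean_abs_shap = pd.Series(abs(shap_values).mean(axis=0), index=X_test.columns)
print(mean_abs_shap.sort_values(ascending=False).head())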