├── cybulde ├── configs │ ├── __init__.py │ ├── automatically_generated │ │ ├── __init__.py │ │ ├── full_config_header.yaml │ │ ├── hydra │ │ │ └── job_logging │ │ │ │ └── custom.yaml │ │ └── config.yaml │ ├── config.yaml │ └── hydra │ │ └── job_logging │ │ └── custom.yaml ├── utils │ ├── mixins.py │ ├── utils.py │ ├── torch_utils.py │ ├── io_utils.py │ ├── config_utils.py │ ├── gcp_utils.py │ └── mlflow_utils.py ├── __init__.py ├── tests.py ├── training │ ├── loss_functions.py │ ├── tasks │ │ ├── bases.py │ │ ├── common_training_task.py │ │ └── tar_model_exporting_training_task.py │ ├── schedulers.py │ └── lightning_modules │ │ ├── bases.py │ │ └── binary_text_classification.py ├── config_schemas │ ├── infrastructure │ │ ├── instance_group_creator_schemas.py │ │ ├── infrastructure_schema.py │ │ └── instance_template_creator_schemas.py │ ├── base_schemas.py │ ├── training │ │ ├── loss_schemas.py │ │ ├── optimizer_schemas.py │ │ ├── scheduler_schemas.py │ │ ├── training_task_schemas.py │ │ └── training_lightning_module_schemas.py │ ├── models │ │ ├── head_schemas.py │ │ ├── transformation_schemas.py │ │ ├── backbone_schemas.py │ │ ├── model_schemas.py │ │ └── adapter_schemas.py │ ├── trainer │ │ ├── logger_schemas.py │ │ ├── callbacks_schemas.py │ │ └── trainer_schemas.py │ ├── evaluation │ │ ├── evaluation_lightning_module_schemas.py │ │ ├── evaluation_task_schemas.py │ │ └── model_selector_schemas.py │ ├── config_schema.py │ ├── data_module_schemas.py │ └── experiment │ │ └── bert │ │ └── local_bert.py ├── data_modules │ ├── datasets.py │ └── data_modules.py ├── models │ ├── heads.py │ ├── models.py │ ├── common │ │ ├── utils.py │ │ ├── io_utils.py │ │ └── exporter.py │ ├── backbones.py │ ├── transformations.py │ └── adapters.py ├── evaluation │ ├── lightning_modules │ │ ├── bases.py │ │ └── binary_text_evaluation.py │ ├── tasks │ │ ├── bases.py │ │ └── common_evaluation_task.py │ └── model_selector.py ├── web_app │ └── server.py ├── run_tasks.py ├── launch_job_on_gcp.py ├── generate_final_config.py └── infrastructure │ ├── instance_group_creator.py │ └── instance_template_creator.py ├── README.md ├── .envs ├── .postgres ├── .mlflow-common ├── .mlflow-prod ├── .infrastructure └── .mlflow-dev ├── .dockerignore ├── docker ├── scripts │ ├── start-tracking-server.sh │ ├── start-prediction-service.sh │ └── startup-script.sh └── Dockerfile ├── .gitattributes ├── setup.cfg ├── scripts ├── deploy-etcd-server.sh └── vm_startup │ └── task_runner_startup_script.sh ├── docker-compose.yaml ├── .gitignore ├── pyproject.toml ├── Makefile └── LICENSE /cybulde/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cybulde-project-template 2 | A simple template for the Cybulde project 3 | -------------------------------------------------------------------------------- /.envs/.postgres: -------------------------------------------------------------------------------- 1 | POSTGRES_DB=backend 2 | POSTGRES_USER=backend 3 | POSTGRES_PASSWORD=backend 4 | -------------------------------------------------------------------------------- /cybulde/utils/mixins.py:
-------------------------------------------------------------------------------- 1 | class LoggableParamsMixin: 2 | def loggable_params(self) -> list[str]: 3 | return [] 4 | -------------------------------------------------------------------------------- /.envs/.mlflow-common: -------------------------------------------------------------------------------- 1 | 2 | 3 | LOCAL_DEV_MLFLOW_SERVER_HOST=127.0.0.1 4 | LOCAL_DEV_MLFLOW_SERVER_PORT=6101 5 | 6 | PROD_MLFLOW_SERVER_PORT=6100 7 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # We would like to ignore everything, then allow only required files and directories to pass 2 | * 3 | 4 | !docker 5 | !cybulde 6 | !pyproject.toml 7 | !poetry.lock 8 | -------------------------------------------------------------------------------- /cybulde/utils/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import socket 3 | 4 | 5 | def get_logger(name: str) -> logging.Logger: 6 | return logging.getLogger(f"[{socket.gethostname()}] {name}") 7 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/full_config_header.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - override hydra/hydra_logging: disabled 3 | - _self_ 4 | hydra: 5 | output_subdir: null 6 | run: 7 | dir: . 8 | -------------------------------------------------------------------------------- /cybulde/configs/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - config_schema 3 | 4 | - override hydra/job_logging: custom 5 | - override hydra/hydra_logging: disabled 6 | - _self_ 7 | 8 | hydra: 9 | output_subdir: null 10 | run: 11 | dir: . 
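For orientation, here is a minimal, hypothetical sketch (not a file in this repo; the repo's own entry points go through cybulde.utils.config_utils.get_config instead) of how a root config like the one above is composed: the structured config_schema entry in its defaults list must first be registered in Hydra's ConfigStore, which is what setup_config() in cybulde/config_schemas/config_schema.py does. The config_path below assumes the script lives at the repository root.

import hydra
from omegaconf import DictConfig, OmegaConf

from cybulde.config_schemas.config_schema import setup_config

setup_config()  # registers "config_schema" and its nested groups in the ConfigStore

@hydra.main(config_path="cybulde/configs", config_name="config", version_base=None)
def main(config: DictConfig) -> None:
    # The composed config is validated against the structured Config dataclass.
    print(OmegaConf.to_yaml(config))

if __name__ == "__main__":
    main()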
12 | -------------------------------------------------------------------------------- /cybulde/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | warnings.filterwarnings(action="ignore", category=RuntimeWarning, module=".*schema.*") 4 | 5 | from cybulde.config_schemas.experiment.bert import local_bert # noqa: E402 6 | 7 | __all__ = ["local_bert"] 8 | -------------------------------------------------------------------------------- /docker/scripts/start-tracking-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mlflow server \ 4 | --backend-store-uri "${MLFLOW_BACKEND_STORE}" \ 5 | --default-artifact-root "${MLFLOW_ARTIFACT_STORE}" \ 6 | --host 0.0.0.0 \ 7 | --port "${LOCAL_DEV_MLFLOW_SERVER_PORT}" 8 | -------------------------------------------------------------------------------- /.envs/.mlflow-prod: -------------------------------------------------------------------------------- 1 | IS_PROD_ENV=true 2 | GOOGLE_CLOUD_PROJECT=cybulde 3 | MLFLOW_INTERNAL_TRACKING_URI=http://cybulde-mlflow.europe-west4-a.c.${GOOGLE_CLOUD_PROJECT}.internal:${PROD_MLFLOW_SERVER_PORT} 4 | MLFLOW_TRACKING_URI=http://localhost:${PROD_MLFLOW_SERVER_PORT} 5 | 6 | -------------------------------------------------------------------------------- /docker/scripts/start-prediction-service.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | set -o nounset 6 | 7 | uvicorn cybulde.web_app.server:app \ 8 | --host "${UVICORN_HOST:-0.0.0.0}" \ 9 | --port "${UVICORN_PORT:-8001}" \ 10 | --workers "${UVICORN_WORKERS:-1}" 11 | -------------------------------------------------------------------------------- /cybulde/tests.py: -------------------------------------------------------------------------------- 1 | from cybulde.utils.mlflow_utils import get_all_experiment_ids, get_best_run 2 | 3 | experiments = get_all_experiment_ids() 4 | 5 | print(f"{experiments=}") 6 | 7 | best_runs = get_best_run() 8 | 9 | print(f"{best_runs=}") 10 | print(f"{best_runs['metrics.test_f1_score']=}") 11 | -------------------------------------------------------------------------------- /.envs/.infrastructure: -------------------------------------------------------------------------------- 1 | GCP_PROJECT_ID=cybulde 2 | GCP_ARTIFACT_REGISTRY_REPOSITORY_NAME=cybulde 3 | VM_NAME=cybulde-mlflow 4 | ZONE=europe-west4-a 5 | DOCKER_IMAGE_NAME=cybulde-model 6 | GCP_DOCKER_REGISTRY_URL=europe-west4-docker.pkg.dev/${GCP_PROJECT_ID}/${GCP_ARTIFACT_REGISTRY_REPOSITORY_NAME}/${DOCKER_IMAGE_NAME} 7 | -------------------------------------------------------------------------------- /.envs/.mlflow-dev: -------------------------------------------------------------------------------- 1 | 2 | IS_PROD_ENV=false 3 | 4 | MLFLOW_BACKEND_STORE=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@mlflow-backend-store/${POSTGRES_DB} 5 | MLFLOW_ARTIFACT_STORE=/mlflow-artifact-store 6 | 7 | MLFLOW_INTERNAL_TRACKING_URI=http://${LOCAL_DEV_MLFLOW_SERVER_HOST}:${LOCAL_DEV_MLFLOW_SERVER_PORT} 8 | MLFLOW_TRACKING_URI=${MLFLOW_INTERNAL_TRACKING_URI} 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pxd text diff=python 2 | *.py text diff=python 3 | *.py3 text diff=python 4 | *.pyw text diff=python 5 | *.pyx text 
diff=python 6 | *.pyz text diff=python 7 | *.pyi text diff=python 8 | 9 | *.pkl binary 10 | *.pickle binary 11 | *.pyc binary 12 | *.pyd binary 13 | *.pyo binary 14 | 15 | *.ipynb text 16 | * text=auto 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # https://pycodestyle.readthedocs.io/en/latest/intro.html#error-codes 3 | # http://flake8.pycqa.org/en/latest/user/error-codes.html 4 | ignore = E501,W503,W504,E203,I201,I202 5 | max-line-length = 120 6 | import-order-style = pep8 7 | application_import_names = 8 | cybulde 9 | exclude = 10 | .git 11 | 12 | [pycodestyle] 13 | max-line-length = 120 14 | -------------------------------------------------------------------------------- /docker/scripts/startup-script.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | set -o nounset 6 | 7 | if [[ "${IS_PROD_ENV}" == "true" ]]; then 8 | /usr/local/gcloud/google-cloud-sdk/bin/gcloud compute ssh "${VM_NAME}" --zone "${ZONE}" --tunnel-through-iap -- -4 -N -L ${PROD_MLFLOW_SERVER_PORT}:localhost:${PROD_MLFLOW_SERVER_PORT} 9 | else 10 | /start-prediction-service.sh & 11 | /start-tracking-server.sh & 12 | tail -F anything 13 | fi 14 | -------------------------------------------------------------------------------- /cybulde/training/loss_functions.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch.nn.functional as F 4 | 5 | from torch import Tensor, nn 6 | 7 | 8 | class LossFunction(nn.Module): 9 | pass 10 | 11 | 12 | class BCEWithLogitsLoss(LossFunction): 13 | def __init__(self, reduction: str = "mean") -> None: 14 | super().__init__() 15 | self.reduction = reduction 16 | 17 | def forward(self, x: Tensor, target: Tensor, pos_weight: Optional[Tensor] = None) -> Tensor: 18 | return F.binary_cross_entropy_with_logits(x, target, reduction=self.reduction, pos_weight=pos_weight) 19 | -------------------------------------------------------------------------------- /cybulde/configs/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | brief: 4 | format: '[%(levelname)s] %(asctime)s %(name)s: %(message)s' 5 | datefmt: '%Y-%m-%d %H:%M:%S' 6 | handlers: 7 | file: 8 | level: INFO 9 | class: logging.handlers.RotatingFileHandler 10 | formatter: brief 11 | maxBytes: 1024 12 | backupCount: 0 13 | filename: logs.log 14 | mode: w 15 | encoding: utf8 16 | console: 17 | level: DEBUG 18 | class: logging.StreamHandler 19 | formatter: brief 20 | stream: ext://sys.stdout 21 | root: 22 | level: INFO 23 | handlers: [file, console] 24 | 25 | disable_existing_loggers: false 26 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | brief: 4 | format: '[%(levelname)s] %(asctime)s %(name)s: %(message)s' 5 | datefmt: '%Y-%m-%d %H:%M:%S' 6 | handlers: 7 | file: 8 | level: INFO 9 | class: logging.handlers.RotatingFileHandler 10 | formatter: brief 11 | maxBytes: 1024 12 | backupCount: 0 13 | filename: logs.log 14 | mode: w 15 | encoding: utf8 16 | console: 17 | level: DEBUG 18 | class: 
logging.StreamHandler 19 | formatter: brief 20 | stream: ext://sys.stdout 21 | root: 22 | level: INFO 23 | handlers: [file, console] 24 | 25 | disable_existing_loggers: false 26 | -------------------------------------------------------------------------------- /cybulde/config_schemas/infrastructure/instance_group_creator_schemas.py: -------------------------------------------------------------------------------- 1 | from omegaconf import SI 2 | from pydantic.dataclasses import dataclass 3 | 4 | from cybulde.config_schemas.infrastructure.instance_template_creator_schemas import InstanceTemplateCreatorConfig 5 | 6 | 7 | @dataclass 8 | class InstanceGroupCreatorConfig: 9 | _target_: str = "cybulde.infrastructure.instance_group_creator.InstanceGroupCreator" 10 | instance_template_creator: InstanceTemplateCreatorConfig = InstanceTemplateCreatorConfig() 11 | name: str = SI("${infrastructure.mlflow.experiment_name}-${infrastructure.mlflow.run_name}-${now:%Y%m%d%H%M%S}") 12 | node_count: int = 1 13 | project_id: str = SI("${infrastructure.project_id}") 14 | zone: str = SI("${infrastructure.zone}") 15 | -------------------------------------------------------------------------------- /cybulde/data_modules/datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from torch import Tensor 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class TextClassificationDataset(Dataset): 8 | def __init__(self, df_path: str, text_column_name: str, label_column_name: str) -> None: 9 | super().__init__() 10 | self.df = pd.read_parquet(df_path) 11 | self.text_column_name = text_column_name 12 | self.label_column_name = label_column_name 13 | 14 | def __getitem__(self, idx: int) -> tuple[str, Tensor]: 15 | row = self.df.iloc[idx] 16 | 17 | text = row[self.text_column_name] 18 | label = row[self.label_column_name] 19 | 20 | return text, Tensor([label]) 21 | 22 | def __len__(self) -> int: 23 | return len(self.df) 24 | -------------------------------------------------------------------------------- /cybulde/config_schemas/base_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from omegaconf import MISSING 4 | 5 | from cybulde.config_schemas.data_module_schemas import DataModuleConfig 6 | from cybulde.config_schemas.trainer.trainer_schemas import TrainerConfig 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class LightningModuleConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | 14 | 15 | @dataclass 16 | class TaskConfig(LoggableParamsMixin): 17 | _target_: str = MISSING 18 | name: str = MISSING 19 | data_module: DataModuleConfig = MISSING 20 | lightning_module: LightningModuleConfig = MISSING 21 | trainer: TrainerConfig = MISSING 22 | 23 | def loggable_params(self) -> list[str]: 24 | return ["_target_"] 25 | -------------------------------------------------------------------------------- /cybulde/models/heads.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor, nn 2 | 3 | 4 | class Head(nn.Module): 5 | pass 6 | 7 | 8 | class SoftmaxHead(Head): 9 | def __init__(self, in_features: int, out_features: int, dim: int = 1) -> None: 10 | super().__init__() 11 | self.head = nn.Sequential(nn.Linear(in_features, out_features), nn.Softmax(dim=dim)) 12 | 13 | def forward(self, x: Tensor) -> Tensor: 14 | output: Tensor = self.head(x) 15 | return output 16 | 17 | 18 
| class SigmoidHead(Head): 19 | def __init__(self, in_features: int, out_features: int) -> None: 20 | super().__init__() 21 | 22 | self.head = nn.Sequential(nn.Linear(in_features, out_features), nn.Sigmoid()) 23 | 24 | def forward(self, x: Tensor) -> Tensor: 25 | output: Tensor = self.head(x) 26 | return output 27 | -------------------------------------------------------------------------------- /cybulde/evaluation/lightning_modules/bases.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any, Protocol 3 | 4 | from lightning.pytorch import LightningModule 5 | from torch import Tensor 6 | 7 | from cybulde.models.models import Model 8 | from cybulde.models.transformations import Transformation 9 | 10 | 11 | class EvaluationLightningModule(LightningModule): 12 | def __init__(self, model: Model) -> None: 13 | super().__init__() 14 | self.model = model 15 | 16 | @abstractmethod 17 | def test_step(self, batch: Any, batch_idx: int) -> Tensor: 18 | ... 19 | 20 | @abstractmethod 21 | def get_transformation(self) -> Transformation: 22 | ... 23 | 24 | 25 | class PartialEvaluationLightningModuleType(Protocol): 26 | def __call__(self, model: Model) -> EvaluationLightningModule: 27 | ... 28 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/loss_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.utils.mixins import LoggableParamsMixin 7 | 8 | 9 | @dataclass 10 | class LossFunctionConfig(LoggableParamsMixin): 11 | _target_: str = MISSING 12 | 13 | def loggable_params(self) -> list[str]: 14 | return ["_target_"] 15 | 16 | 17 | @dataclass 18 | class BCEWithLogitsLossConfig(LossFunctionConfig): 19 | _target_: str = "cybulde.training.loss_functions.BCEWithLogitsLoss" 20 | reduction: str = "mean" 21 | 22 | 23 | def setup_config() -> None: 24 | cs = ConfigStore.instance() 25 | cs.store( 26 | name="bce_with_logits_loss_schema", 27 | group="tasks/lightning_module/loss", 28 | node=BCEWithLogitsLossConfig, 29 | ) 30 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/head_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.utils.mixins import LoggableParamsMixin 7 | 8 | 9 | @dataclass 10 | class HeadConfig(LoggableParamsMixin): 11 | _target_: str = MISSING 12 | 13 | def loggable_params(self) -> list[str]: 14 | return ["_target_"] 15 | 16 | 17 | @dataclass 18 | class SigmoidHeadConfig(HeadConfig): 19 | _target_: str = "cybulde.models.heads.SigmoidHead" 20 | in_features: int = MISSING 21 | out_features: int = MISSING 22 | 23 | 24 | @dataclass 25 | class BinaryClassificationSigmoidHead(SigmoidHeadConfig): 26 | in_features: int = 128 27 | out_features: int = 1 28 | 29 | 30 | def setup_config() -> None: 31 | cs = ConfigStore.instance() 32 | cs.store( 33 | name="sigmoid_head_schema", 34 | group="tasks/lightning_module/model/head", 35 | node=SigmoidHeadConfig, 36 | ) 37 | -------------------------------------------------------------------------------- /cybulde/config_schemas/trainer/logger_schemas.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING, SI 6 | 7 | 8 | @dataclass 9 | class LoggerConfig: 10 | _target_: str = MISSING 11 | 12 | 13 | @dataclass 14 | class MLFlowLoggerConfig(LoggerConfig): 15 | _target_: str = "lightning.pytorch.loggers.mlflow.MLFlowLogger" 16 | experiment_name: str = SI("${infrastructure.mlflow.experiment_name}") 17 | run_name: Optional[str] = SI("${infrastructure.mlflow.run_name}") 18 | tracking_uri: Optional[str] = SI("${infrastructure.mlflow.mlflow_internal_tracking_uri}") 19 | tags: Optional[dict[str, Any]] = None 20 | save_dir: Optional[str] = None 21 | prefix: str = "" 22 | artifact_location: Optional[str] = None 23 | run_id: Optional[str] = SI("${infrastructure.mlflow.run_id}") 24 | 25 | 26 | def setup_config() -> None: 27 | cs = ConfigStore.instance() 28 | cs.store(name="mlflow_logger_schema", group="tasks/trainer/logger", node=MLFlowLoggerConfig) 29 | -------------------------------------------------------------------------------- /cybulde/web_app/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fastapi import FastAPI 4 | from hydra.utils import instantiate 5 | 6 | from cybulde.models.common.exporter import TarModelLoader 7 | from cybulde.utils.config_utils import load_config 8 | from cybulde.utils.mlflow_utils import get_client 9 | 10 | config = load_config(config_path="../configs/automatically_generated", config_name="config") 11 | tokenizer = instantiate(config.tasks.binary_text_classification_task.data_module.transformation) 12 | 13 | model_name = "bert_tiny" 14 | model_version = "1" 15 | mlflow_client = get_client() 16 | 17 | mlflow_model = mlflow_client.get_model_version(name=model_name, version=model_version) 18 | model_path = os.path.join(mlflow_model.source, "exported_model.tar.gz") # type: ignore 19 | model = TarModelLoader(model_path).load() 20 | model.eval() 21 | 22 | app = FastAPI() 23 | 24 | 25 | @app.get("/predict_cyberbullying") 26 | def predict_cyberbullying(text: str) -> dict[str, int]: 27 | tokens = tokenizer([text]) 28 | probs = model(tokens) 29 | classes = (probs >= 0.5).item() 30 | return {"is_cyberbullying": int(classes)} 31 | -------------------------------------------------------------------------------- /cybulde/config_schemas/evaluation/evaluation_lightning_module_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.config_schemas.base_schemas import LightningModuleConfig 7 | 8 | 9 | @dataclass 10 | class EvaluationLightningModuleConfig(LightningModuleConfig): 11 | _target_: str = MISSING 12 | _partial_: bool = False 13 | 14 | def loggable_params(self) -> list[str]: 15 | return ["_target_"] 16 | 17 | 18 | @dataclass 19 | class PartialEvaluationLightningModuleConfig(EvaluationLightningModuleConfig): 20 | _partial_: bool = True 21 | 22 | 23 | @dataclass 24 | class BinaryTextEvaluationLightningModuleConfig(PartialEvaluationLightningModuleConfig): 25 | _target_: str = "cybulde.evaluation.lightning_modules.binary_text_evaluation.BinaryTextEvaluationLightningModule" 26 | 27 | 28 | def setup_config() -> None: 29 | cs = ConfigStore.instance() 30 | cs.store( 31 | 
name="binary_text_classification_prediction_lightning_module_schema", 32 | group="tasks/lightning_module", 33 | node=BinaryTextEvaluationLightningModuleConfig, 34 | ) 35 | -------------------------------------------------------------------------------- /cybulde/models/models.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Optional 3 | 4 | from torch import Tensor, nn 5 | from transformers import BatchEncoding 6 | 7 | from cybulde.models.adapters import Adapter 8 | from cybulde.models.backbones import Backbone 9 | from cybulde.models.heads import Head 10 | from cybulde.models.transformations import Transformation 11 | 12 | 13 | class Model(nn.Module): 14 | @abstractmethod 15 | def get_transformation(self) -> Transformation: 16 | ... 17 | 18 | 19 | class BinaryTextClassificationModel(Model): 20 | def __init__(self, backbone: Backbone, head: Head, adapter: Optional[Adapter]) -> None: 21 | super().__init__() 22 | self.backbone = backbone 23 | self.adapter = adapter 24 | self.head = head 25 | 26 | def forward(self, encodings: BatchEncoding) -> Tensor: 27 | output = self.backbone(encodings) 28 | if self.adapter is not None: 29 | output = self.adapter(output) 30 | output = self.head(output) 31 | assert isinstance(output, Tensor) 32 | return output 33 | 34 | def get_transformation(self) -> Transformation: 35 | return self.backbone.get_transformation() 36 | -------------------------------------------------------------------------------- /cybulde/run_tasks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from hydra.utils import instantiate 4 | from lightning.pytorch import seed_everything 5 | 6 | from cybulde.config_schemas.config_schema import Config 7 | from cybulde.utils.config_utils import get_config 8 | from cybulde.utils.torch_utils import get_local_rank 9 | from cybulde.utils.utils import get_logger 10 | 11 | 12 | @get_config( 13 | config_path="../configs/automatically_generated", config_name="config", to_object=False, return_dict_config=True 14 | ) 15 | def run_tasks(config: Config) -> None: 16 | logger = get_logger(__file__) 17 | assert config.infrastructure.mlflow.run_id is not None, "Run id has to be set for running tasks" 18 | 19 | backend = "gloo" 20 | if torch.cuda.is_available(): 21 | torch.cuda.set_device(f"cuda:{get_local_rank()}") 22 | backend = "nccl" 23 | 24 | torch.distributed.init_process_group(backend=backend) 25 | 26 | seed_everything(seed=config.seed, workers=True) 27 | 28 | for task_name, task_config in config.tasks.items(): 29 | logger.info(f"Running task: {task_name}") 30 | task = instantiate(task_config) 31 | task.run(config=config, task_config=task_config) 32 | 33 | 34 | if __name__ == "__main__": 35 | run_tasks() 36 | -------------------------------------------------------------------------------- /cybulde/config_schemas/config_schema.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | from pydantic.dataclasses import dataclass 6 | 7 | from cybulde.config_schemas import base_schemas 8 | from cybulde.config_schemas.evaluation import evaluation_task_schemas, model_selector_schemas 9 | from cybulde.config_schemas.infrastructure import infrastructure_schema 10 | from cybulde.config_schemas.training import training_task_schemas 11 | 12 | 13 | @dataclass 14 | class 
Config: 15 | infrastructure: infrastructure_schema.InfrastructureConfig = infrastructure_schema.InfrastructureConfig() 16 | save_last_checkpoint_every_n_train_steps: int = 500 17 | seed: int = 1234 18 | tasks: dict[str, base_schemas.TaskConfig] = MISSING 19 | model_selector: Optional[model_selector_schemas.ModelSelectorConfig] = None 20 | registered_model_name: Optional[str] = None 21 | docker_image: Optional[str] = None 22 | 23 | 24 | def setup_config() -> None: 25 | infrastructure_schema.setup_config() 26 | training_task_schemas.setup_config() 27 | evaluation_task_schemas.setup_config() 28 | model_selector_schemas.setup_config() 29 | 30 | cs = ConfigStore.instance() 31 | cs.store(name="config_schema", node=Config) 32 | -------------------------------------------------------------------------------- /cybulde/launch_job_on_gcp.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import mlflow 4 | 5 | from hydra.utils import instantiate 6 | 7 | from cybulde.utils.config_utils import get_config 8 | from cybulde.utils.gcp_utils import TrainingInfo 9 | 10 | if TYPE_CHECKING: 11 | from cybulde.config_schemas.config_schema import Config 12 | 13 | 14 | @get_config( 15 | config_path="../configs/automatically_generated", config_name="config", to_object=False, return_dict_config=True 16 | ) 17 | def run(config: "Config") -> None: 18 | run_id = config.infrastructure.mlflow.run_id 19 | assert run_id is not None 20 | 21 | instance_group_creator = instantiate(config.infrastructure.instance_group_creator) 22 | instance_ids = instance_group_creator.launch_instance_group() 23 | training_info = TrainingInfo( 24 | project_id=config.infrastructure.project_id, 25 | zone=config.infrastructure.zone, 26 | instance_group_name=config.infrastructure.instance_group_creator.name, 27 | instance_ids=instance_ids, 28 | mlflow_experiment_url=config.infrastructure.mlflow.experiment_url, 29 | ) 30 | mlflow.start_run(run_id=run_id, description=training_info.get_job_info_message()) 31 | training_info.print_job_info() 32 | 33 | 34 | if __name__ == "__main__": 35 | run() 36 | -------------------------------------------------------------------------------- /scripts/deploy-etcd-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcloud compute instances create-with-container etcd-server \ 4 | --project=cybulde \ 5 | --zone=europe-west4-a \ 6 | --machine-type=n1-standard-1 \ 7 | --network-interface=subnet=default,no-address \ 8 | --maintenance-policy=MIGRATE \ 9 | --provisioning-model=STANDARD \ 10 | --service-account=941446584999-compute@developer.gserviceaccount.com \ 11 | --scopes=https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring.write,https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/service.management.readonly,https://www.googleapis.com/auth/trace.append \ 12 | --image=projects/cos-cloud/global/images/cos-stable-109-17800-66-15 \ 13 | --boot-disk-size=10GB \ 14 | --boot-disk-type=pd-balanced \ 15 | --boot-disk-device-name=etcd-server \ 16 | --container-image=docker.io/bitnami/etcd:3.5 \ 17 | --container-restart-policy=always \ 18 | --container-privileged \ 19 | --container-env=ALLOW_NONE_AUTHENTICATION=yes,ETCD_ADVERTISE_CLIENT_URLS=http://0.0.0.0:2379,ETCD_ENABLE_V2=true,ETCDCTL_API=2 \ 20 | --no-shielded-secure-boot \ 21 | --shielded-vtpm \ 22 | 
--shielded-integrity-monitoring \ 23 | --labels=goog-ec-src=vm_add-gcloud,container-vm=cos-stable-109-17800-66-15 24 | -------------------------------------------------------------------------------- /cybulde/config_schemas/infrastructure/infrastructure_schema.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import SI 6 | 7 | from cybulde.config_schemas.infrastructure.instance_group_creator_schemas import InstanceGroupCreatorConfig 8 | 9 | 10 | @dataclass 11 | class MLFlowConfig: 12 | mlflow_external_tracking_uri: str = SI("${oc.env:MLFLOW_TRACKING_URI,localhost:6101}") 13 | mlflow_internal_tracking_uri: str = SI("${oc.env:MLFLOW_INTERNAL_TRACKING_URI,localhost:6101}") 14 | experiment_name: str = "Default" 15 | run_name: Optional[str] = None 16 | run_id: Optional[str] = None 17 | experiment_id: Optional[str] = None 18 | experiment_url: str = SI("${.mlflow_external_tracking_uri}/#/experiments/${.experiment_id}/runs/${.run_id}") 19 | artifact_uri: Optional[str] = None 20 | 21 | 22 | @dataclass 23 | class InfrastructureConfig: 24 | project_id: str = "cybulde" 25 | zone: str = "europe-west4-b" 26 | instance_group_creator: InstanceGroupCreatorConfig = InstanceGroupCreatorConfig() 27 | mlflow: MLFlowConfig = MLFlowConfig() 28 | etcd_ip: Optional[str] = "10.164.0.12:2379" 29 | 30 | 31 | def setup_config() -> None: 32 | cs = ConfigStore.instance() 33 | cs.store( 34 | name="infrastructure_schema", 35 | group="infrastructure", 36 | node=InfrastructureConfig, 37 | ) 38 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/transformation_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.utils.mixins import LoggableParamsMixin 7 | 8 | 9 | @dataclass 10 | class TransformationConfig(LoggableParamsMixin): 11 | _target_: str = MISSING 12 | 13 | def loggable_params(self) -> list[str]: 14 | return ["_target_"] 15 | 16 | 17 | @dataclass 18 | class HuggingFaceTokenizationTransformationConfig(TransformationConfig): 19 | _target_: str = "cybulde.models.transformations.HuggingFaceTokenizationTransformation" 20 | pretrained_tokenizer_name_or_path: str = MISSING 21 | max_sequence_length: int = MISSING 22 | 23 | def loggable_params(self) -> list[str]: 24 | return super().loggable_params() + ["pretrained_tokenizer_name_or_path", "max_sequence_length"] 25 | 26 | 27 | @dataclass 28 | class CustomHuggingFaceTokenizationTransformationConfig(HuggingFaceTokenizationTransformationConfig): 29 | pretrained_tokenizer_name_or_path: str = "gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer" 30 | max_sequence_length: int = 200 31 | 32 | 33 | def setup_config() -> None: 34 | cs = ConfigStore.instance() 35 | cs.store( 36 | name="huggingface_tokenization_transformation_schema", 37 | group="tasks/data_module/transformation", 38 | node=HuggingFaceTokenizationTransformationConfig, 39 | ) 40 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/optimizer_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from
hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class OptimizerConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | _partial_: bool = True 14 | lr: float = MISSING 15 | 16 | def loggable_params(self) -> list[str]: 17 | return ["_target_", "lr"] 18 | 19 | 20 | @dataclass 21 | class AdamOptimizerConfig(OptimizerConfig): 22 | _target_: str = "torch.optim.Adam" 23 | lr: float = 5e-3 24 | betas: tuple[float, float] = (0.9, 0.999) 25 | eps: float = 1e-8 26 | weight_decay: float = 0.0 27 | amsgrad: bool = False 28 | foreach: Optional[bool] = None 29 | maximize: bool = False 30 | capturable: bool = False 31 | 32 | 33 | @dataclass 34 | class AdamWOptimizerConfig(AdamOptimizerConfig): 35 | _target_: str = "torch.optim.AdamW" 36 | lr: float = 5e-5 37 | weight_decay: float = 1e-3 38 | 39 | 40 | def setup_config() -> None: 41 | cs = ConfigStore.instance() 42 | cs.store( 43 | name="adam_optimizer_schema", 44 | group="tasks/lightning_module/optimizer", 45 | node=AdamOptimizerConfig, 46 | ) 47 | 48 | cs.store( 49 | name="adamw_optimizer_schema", 50 | group="tasks/lightning_module/optimizer", 51 | node=AdamWOptimizerConfig, 52 | ) 53 | -------------------------------------------------------------------------------- /cybulde/models/common/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from contextlib import contextmanager 4 | from typing import Generator 5 | 6 | import torch 7 | 8 | 9 | def get_local_rank() -> int: 10 | return int(os.getenv("LOCAL_RANK", -1)) 11 | 12 | 13 | def get_global_rank() -> int: 14 | return int(os.getenv("RANK", get_local_rank())) 15 | 16 | 17 | @contextmanager 18 | def local_rank_zero_first() -> Generator[None, None, None]: 19 | if not torch.distributed.is_initialized() and os.getenv("RANK") is not None: 20 | raise RuntimeError("RANK is set but torch.distributed is not initialized") 21 | 22 | if torch.distributed.is_initialized(): 23 | rank = get_local_rank() 24 | if rank not in [-1, 0]: 25 | torch.distributed.barrier() # type: ignore 26 | yield 27 | if rank == 0: 28 | torch.distributed.barrier() # type: ignore 29 | else: 30 | yield 31 | 32 | 33 | @contextmanager 34 | def global_rank_zero_first() -> Generator[None, None, None]: 35 | if not torch.distributed.is_initialized() and os.getenv("RANK") is not None: 36 | raise RuntimeError("RANK is set but torch.distributed is not initialized") 37 | 38 | if torch.distributed.is_initialized(): 39 | rank = get_global_rank() 40 | if rank not in [-1, 0]: 41 | torch.distributed.barrier() # type: ignore 42 | yield 43 | if rank == 0: 44 | torch.distributed.barrier() # type: ignore 45 | else: 46 | yield 47 | -------------------------------------------------------------------------------- /cybulde/utils/torch_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | 4 | from typing import Any 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | 9 | from matplotlib.pyplot import figure 10 | from torch import Tensor 11 | 12 | 13 | def plot_confusion_matrix(confusion_matrix: Tensor, class_names: list[str]) -> Any: 14 | confusion_matrix = confusion_matrix.cpu().detach().numpy() 15 | 16 | figure(num=None, figsize=(16, 12), dpi=60, facecolor="w", edgecolor="k") 17 | plt.imshow(confusion_matrix, interpolation="nearest", cmap=plt.cm.Purples) # type: ignore 18 | plt.colorbar() 19 | 
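# Tick marks below carry the class names on both axes (x labels rotated 90 degrees so long
# names stay legible); the loop that follows annotates every cell with its raw count, using
# white text once a cell's value exceeds half the matrix maximum so it stays readable.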
20 | tick_marks = np.arange(len(class_names)) 21 | plt.xticks(tick_marks, class_names, rotation=90, fontsize=20) 22 | plt.yticks(tick_marks, class_names, fontsize=20) 23 | 24 | fmt = "d" 25 | thresh = confusion_matrix.max() / 2.0 26 | for i, j in itertools.product(range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1])): 27 | plt.text( 28 | j, 29 | i, 30 | format(confusion_matrix[i, j], fmt), 31 | horizontalalignment="center", 32 | color="white" if confusion_matrix[i, j] > thresh else "black", 33 | fontsize=20, 34 | ) 35 | 36 | plt.title("Confusion matrix") 37 | plt.ylabel("Actual label", fontsize=20) 38 | plt.xlabel("Predicted label", fontsize=20) 39 | plt.tight_layout() 40 | 41 | return plt.gcf() 42 | 43 | 44 | def get_local_rank() -> int: 45 | return int(os.getenv("LOCAL_RANK", -1)) 46 | -------------------------------------------------------------------------------- /cybulde/models/backbones.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoConfig, AutoModel, BatchEncoding 3 | from transformers.modeling_outputs import BaseModelOutputWithPooling 4 | 5 | from cybulde.models.transformations import Transformation 6 | from cybulde.utils.io_utils import translate_gcs_dir_to_local 7 | 8 | 9 | class Backbone(nn.Module): 10 | def __init__(self, transformation: Transformation) -> None: 11 | super().__init__() 12 | self.transformation = transformation 13 | 14 | def get_transformation(self) -> Transformation: 15 | return self.transformation 16 | 17 | 18 | class HuggingFaceBackbone(Backbone): 19 | def __init__( 20 | self, pretrained_model_name_or_path: str, transformation: Transformation, pretrained: bool = False 21 | ) -> None: 22 | super().__init__(transformation) 23 | self.backbone = self.get_backbone(pretrained_model_name_or_path, pretrained) 24 | 25 | def forward(self, encodings: BatchEncoding) -> BaseModelOutputWithPooling: 26 | output: BaseModelOutputWithPooling = self.backbone(**encodings) 27 | return output 28 | 29 | def get_backbone(self, pretrained_model_name_or_path: str, pretrained: bool) -> nn.Module: 30 | path = translate_gcs_dir_to_local(pretrained_model_name_or_path) 31 | config = AutoConfig.from_pretrained(path) 32 | if pretrained: 33 | backbone_from_pretrained: nn.Module = AutoModel.from_pretrained(path, config=config) 34 | return backbone_from_pretrained 35 | 36 | backbone_from_config: nn.Module = AutoModel.from_config(config) 37 | return backbone_from_config 38 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/backbone_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.config_schemas.models.transformation_schemas import ( 7 | CustomHuggingFaceTokenizationTransformationConfig, 8 | TransformationConfig, 9 | ) 10 | from cybulde.utils.mixins import LoggableParamsMixin 11 | 12 | 13 | @dataclass 14 | class BackboneConfig(LoggableParamsMixin): 15 | _target_: str = MISSING 16 | transformation: TransformationConfig = MISSING 17 | 18 | def loggable_params(self) -> list[str]: 19 | return ["_target_"] 20 | 21 | 22 | @dataclass 23 | class HuggingFaceBackboneConfig(BackboneConfig): 24 | _target_: str = "cybulde.models.backbones.HuggingFaceBackbone" 25 | pretrained_model_name_or_path: str = MISSING 26 | pretrained: bool = False 27 | 28 | 
def loggable_params(self) -> list[str]: 29 | return super().loggable_params() + ["pretrained_model_name_or_path", "pretrained"] 30 | 31 | 32 | @dataclass 33 | class BertTinyHuggingFaceBackboneConfig(HuggingFaceBackboneConfig): 34 | pretrained_model_name_or_path: str = "prajjwal1/bert-tiny" 35 | transformation: TransformationConfig = CustomHuggingFaceTokenizationTransformationConfig() 36 | 37 | 38 | def setup_config() -> None: 39 | cs = ConfigStore.instance() 40 | cs.store( 41 | name="hugging_face_backbone_schema", 42 | group="tasks/lightning_module/model/backbone", 43 | node=HuggingFaceBackboneConfig, 44 | ) 45 | 46 | cs.store( 47 | name="test_backbone_config", 48 | node=BertTinyHuggingFaceBackboneConfig, 49 | ) 50 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/model_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.config_schemas.models import adapter_schemas, backbone_schemas, head_schemas 8 | from cybulde.utils.mixins import LoggableParamsMixin 9 | 10 | 11 | @dataclass 12 | class ModelConfig(LoggableParamsMixin): 13 | _target_: str = MISSING 14 | 15 | def loggable_params(self) -> list[str]: 16 | return ["_target_"] 17 | 18 | 19 | @dataclass 20 | class BinaryTextClassificationModelConfig(ModelConfig): 21 | _target_: str = "cybulde.models.models.BinaryTextClassificationModel" 22 | backbone: backbone_schemas.BackboneConfig = MISSING 23 | adapter: Optional[adapter_schemas.AdapterConfig] = None 24 | head: head_schemas.HeadConfig = MISSING 25 | 26 | 27 | @dataclass 28 | class BertTinyBinaryTextClassificationModelConfig(BinaryTextClassificationModelConfig): 29 | backbone: backbone_schemas.BackboneConfig = backbone_schemas.BertTinyHuggingFaceBackboneConfig() 30 | adapter: Optional[adapter_schemas.AdapterConfig] = adapter_schemas.PoolerOutputAdapterConfig() 31 | head: head_schemas.HeadConfig = head_schemas.BinaryClassificationSigmoidHead() 32 | 33 | 34 | def setup_config() -> None: 35 | backbone_schemas.setup_config() 36 | adapter_schemas.setup_config() 37 | head_schemas.setup_config() 38 | 39 | cs = ConfigStore.instance() 40 | cs.store( 41 | name="binary_text_classification_model_schema", 42 | group="tasks/lightning_module/model", 43 | node=BinaryTextClassificationModelConfig, 44 | ) 45 | -------------------------------------------------------------------------------- /cybulde/config_schemas/evaluation/evaluation_task_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.config_schemas import data_module_schemas 7 | from cybulde.config_schemas.base_schemas import TaskConfig 8 | from cybulde.config_schemas.evaluation import evaluation_lightning_module_schemas 9 | from cybulde.config_schemas.trainer import trainer_schemas 10 | 11 | 12 | @dataclass 13 | class EvaluationTaskConfig(TaskConfig): 14 | pass 15 | 16 | 17 | @dataclass 18 | class TarModelEvaluationTaskConfig(EvaluationTaskConfig): 19 | tar_model_path: str = MISSING 20 | lightning_module: evaluation_lightning_module_schemas.PartialEvaluationLightningModuleConfig = MISSING 21 | 22 | 23 | @dataclass 24 | class CommonEvaluationTaskConfig(TarModelEvaluationTaskConfig): 25 | 
_target_: str = "cybulde.evaluation.tasks.common_evaluation_task.CommonEvaluationTask" 26 | 27 | 28 | @dataclass 29 | class DefaultCommonEvaluationTaskConfig(CommonEvaluationTaskConfig): 30 | name: str = "binary_text_evaluation_task" 31 | lightning_module: evaluation_lightning_module_schemas.PartialEvaluationLightningModuleConfig = ( 32 | evaluation_lightning_module_schemas.BinaryTextEvaluationLightningModuleConfig() 33 | ) 34 | 35 | 36 | def setup_config() -> None: 37 | data_module_schemas.setup_config() 38 | evaluation_lightning_module_schemas.setup_config() 39 | trainer_schemas.setup_config() 40 | 41 | cs = ConfigStore.instance() 42 | cs.store( 43 | name="common_evaluation_task_schema", 44 | group="tasks", 45 | node=CommonEvaluationTaskConfig, 46 | ) 47 | -------------------------------------------------------------------------------- /cybulde/training/tasks/bases.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from functools import partial 3 | from typing import TYPE_CHECKING, Union 4 | 5 | from lightning.pytorch import Trainer 6 | 7 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 8 | from cybulde.training.lightning_modules.bases import TrainingLightningModule 9 | from cybulde.utils.utils import get_logger 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | from cybulde.config_schemas.training.training_task_schemas import TrainingTaskConfig 14 | 15 | 16 | class TrainingTask(ABC): 17 | def __init__( 18 | self, 19 | name: str, 20 | data_module: Union[DataModule, PartialDataModuleType], 21 | lightning_module: TrainingLightningModule, 22 | trainer: Trainer, 23 | best_training_checkpoint: str, 24 | last_training_checkpoint: str, 25 | ) -> None: 26 | super().__init__() 27 | self.name = name 28 | self.trainer = trainer 29 | self.best_training_checkpoint = best_training_checkpoint 30 | self.last_training_checkpoint = last_training_checkpoint 31 | self.logger = get_logger(self.__class__.__name__) 32 | 33 | self.lightning_module = lightning_module 34 | 35 | if isinstance(data_module, partial): 36 | transformation = self.lightning_module.get_transformation() 37 | self.data_module = data_module(transformation=transformation) 38 | else: 39 | self.data_module = data_module 40 | 41 | @abstractmethod 42 | def run(self, config: "Config", task_config: "TrainingTaskConfig") -> None: 43 | ... 44 | -------------------------------------------------------------------------------- /cybulde/training/schedulers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Literal, Optional, Protocol, Union 3 | 4 | from torch.optim import Optimizer 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | 7 | 8 | class PartialSchedulerType(Protocol): 9 | def __call__( 10 | self, optimizer: Optimizer, estimated_stepping_batches: Optional[Union[int, float]] = None 11 | ) -> _LRScheduler: 12 | ... 
13 | 14 | 15 | class LightningScheduler(ABC): 16 | def __init__( 17 | self, 18 | scheduler: PartialSchedulerType, 19 | interval: Literal["epoch", "step"] = "epoch", 20 | frequency: int = 1, 21 | monitor: str = "val_loss", 22 | strict: bool = True, 23 | name: Optional[str] = None, 24 | ) -> None: 25 | self.scheduler = scheduler 26 | self.interval = interval 27 | self.frequency = frequency 28 | self.monitor = monitor 29 | self.strict = strict 30 | self.name = name 31 | 32 | @abstractmethod 33 | def configure_scheduler( 34 | self, optimizer: Optimizer, estimated_stepping_batches: Union[int, float] 35 | ) -> dict[str, Any]: 36 | ... 37 | 38 | 39 | class CommonLightningScheduler(LightningScheduler): 40 | def configure_scheduler( 41 | self, optimizer: Optimizer, estimated_stepping_batches: Union[int, float] 42 | ) -> dict[str, Any]: 43 | return { 44 | "scheduler": self.scheduler(optimizer), 45 | "interval": self.interval, 46 | "frequency": self.frequency, 47 | "monitor": self.monitor, 48 | "strict": self.strict, 49 | "name": self.name, 50 | } 51 | -------------------------------------------------------------------------------- /cybulde/models/transformations.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase 6 | 7 | from cybulde.utils.io_utils import is_dir, is_file, translate_gcs_dir_to_local 8 | 9 | 10 | class Transformation(ABC): 11 | @abstractmethod 12 | def __call__(self, texts: list[str]) -> BatchEncoding: 13 | ... 14 | 15 | 16 | class HuggingFaceTokenizationTransformation(Transformation): 17 | def __init__(self, pretrained_tokenizer_name_or_path: str, max_sequence_length: int) -> None: 18 | super().__init__() 19 | self.max_sequence_length = max_sequence_length 20 | self.tokenizer = self.get_tokenizer(pretrained_tokenizer_name_or_path) 21 | 22 | def __call__(self, texts: list[str]) -> BatchEncoding: 23 | output: BatchEncoding = self.tokenizer.batch_encode_plus( 24 | texts, truncation=True, padding=True, return_tensors="pt", max_length=self.max_sequence_length 25 | ) 26 | return output 27 | 28 | def get_tokenizer(self, pretrained_tokenizer_name_or_path: str) -> PreTrainedTokenizerBase: 29 | if is_dir(pretrained_tokenizer_name_or_path): 30 | tokenizer_dir = translate_gcs_dir_to_local(pretrained_tokenizer_name_or_path) 31 | elif is_file(pretrained_tokenizer_name_or_path): 32 | pretrained_tokenizer_name_or_path = translate_gcs_dir_to_local(pretrained_tokenizer_name_or_path) 33 | tokenizer_dir = os.path.dirname(pretrained_tokenizer_name_or_path) 34 | else: 35 | tokenizer_dir = pretrained_tokenizer_name_or_path 36 | 37 | tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(tokenizer_dir) 38 | return tokenizer 39 | -------------------------------------------------------------------------------- /cybulde/generate_final_config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import TYPE_CHECKING 3 | 4 | import mlflow 5 | 6 | from omegaconf import DictConfig 7 | 8 | from cybulde.utils.config_utils import get_config_and_dict_config, save_config_as_yaml 9 | from cybulde.utils.mlflow_utils import activate_mlflow, log_artifacts_for_reproducibility, log_training_hparams 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | 14 | 15 | @get_config_and_dict_config(config_path="../configs", 
config_name="config") # type: ignore 16 | def generate_final_config(config: "Config", dict_config: DictConfig) -> None: 17 | run: mlflow.ActiveRun 18 | with activate_mlflow( 19 | config.infrastructure.mlflow.experiment_name, 20 | run_id=config.infrastructure.mlflow.run_id, 21 | run_name=config.infrastructure.mlflow.run_name, 22 | ) as run: 23 | run_id: str = run.info.run_id 24 | experiment_id: str = run.info.experiment_id 25 | artifact_uri: str = run.info.artifact_uri 26 | 27 | dict_config.infrastructure.mlflow.artifact_uri = artifact_uri 28 | dict_config.infrastructure.mlflow.run_id = run_id 29 | dict_config.infrastructure.mlflow.experiment_id = experiment_id 30 | 31 | config_save_dir = Path("./cybulde/configs/automatically_generated/") 32 | config_save_dir.mkdir(parents=True, exist_ok=True) 33 | (config_save_dir / "__init__.py").touch(exist_ok=True) 34 | 35 | yaml_config_save_path = config_save_dir / "config.yaml" 36 | save_config_as_yaml(dict_config, str(yaml_config_save_path)) 37 | mlflow.log_artifact(str(yaml_config_save_path)) 38 | 39 | log_training_hparams(config) 40 | log_artifacts_for_reproducibility() 41 | 42 | 43 | if __name__ == "__main__": 44 | generate_final_config() 45 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/scheduler_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class SchedulerConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | _partial_: bool = True 14 | 15 | def loggable_params(self) -> list[str]: 16 | return ["_target_"] 17 | 18 | 19 | @dataclass 20 | class ReduceLROnPlateauSchedulerConfig(SchedulerConfig): 21 | _target_: str = "torch.optim.lr_scheduler.ReduceLROnPlateau" 22 | mode: str = "max" 23 | factor: float = 0.1 24 | patience: int = 10 25 | threshold: float = 1e-4 26 | threshold_mode: str = "rel" 27 | cooldown: int = 0 28 | min_lr: float = 0 29 | eps: float = 1e-8 30 | verbose: bool = False 31 | 32 | 33 | @dataclass 34 | class LightningSchedulerConfig: 35 | _target_: str = MISSING 36 | scheduler: SchedulerConfig = MISSING 37 | interval: str = "epoch" 38 | frequency: int = 1 39 | monitor: str = "validation_f1_score" 40 | strict: bool = True 41 | name: Optional[str] = None 42 | 43 | 44 | @dataclass 45 | class CommonLightningSchedulerConfig(LightningSchedulerConfig): 46 | _target_: str = "cybulde.training.schedulers.CommonLightningScheduler" 47 | 48 | 49 | @dataclass 50 | class ReduceLROnPlateauLightningSchedulerConfig(CommonLightningSchedulerConfig): 51 | scheduler: SchedulerConfig = ReduceLROnPlateauSchedulerConfig(patience=5) 52 | 53 | 54 | def setup_config() -> None: 55 | cs = ConfigStore.instance() 56 | cs.store( 57 | name="reduce_lr_on_plateau_scheduler_schema", 58 | group="tasks/lightning_module/scheduler", 59 | node=ReduceLROnPlateauLightningSchedulerConfig, 60 | ) 61 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/adapter_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.utils.mixins import 
LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class AdapterConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | 14 | def loggable_params(self) -> list[str]: 15 | return ["_target_"] 16 | 17 | 18 | @dataclass 19 | class MLPWithPoolingConfig(AdapterConfig): 20 | _target_: str = "cybulde.models.adapters.MLPWithPooling" 21 | output_feature_sizes: list[int] = MISSING 22 | biases: Optional[list[bool]] = None 23 | activation_fns: Optional[list[Optional[str]]] = None 24 | dropout_drop_probs: Optional[list[float]] = None 25 | batch_norms: Optional[list[bool]] = None 26 | order: str = "LABDN" 27 | standardize_input: bool = True 28 | pooling_method: Optional[str] = None 29 | output_attribute_to_use: Optional[str] = None 30 | 31 | def loggable_params(self) -> list[str]: 32 | return super().loggable_params() + [ 33 | "output_feature_sizes", 34 | "biases", 35 | "activation_fns", 36 | "dropout_drop_probs", 37 | "batch_norms", 38 | "order", 39 | "pooling_method", 40 | "output_attribute_to_use", 41 | ] 42 | 43 | 44 | @dataclass 45 | class PoolerOutputAdapterConfig(MLPWithPoolingConfig): 46 | output_feature_sizes: list[int] = field(default_factory=lambda: [-1]) 47 | output_attribute_to_use: str = "pooler_output" 48 | 49 | 50 | def setup_config() -> None: 51 | cs = ConfigStore.instance() 52 | cs.store( 53 | name="mlp_with_pooling_schema", 54 | group="tasks/lightning_module/model/adapter", 55 | node=MLPWithPoolingConfig, 56 | ) 57 | -------------------------------------------------------------------------------- /cybulde/config_schemas/evaluation/model_selector_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING, SI 6 | 7 | 8 | @dataclass 9 | class MetricComparerConfig: 10 | _target_: str = "cybulde.evaluation.model_selector.MetricComparer" 11 | bigger_is_better: bool = MISSING 12 | can_be_equal: bool = False 13 | metric_name: str = MISSING 14 | threshold: float = 0.0 15 | 16 | 17 | @dataclass 18 | class BinaryF1ScoreMetricComparerConfig(MetricComparerConfig): 19 | bigger_is_better: bool = True 20 | metric_name: str = "test_f1_score" 21 | 22 | 23 | @dataclass 24 | class ModelSizeMetricComparerConfig(MetricComparerConfig): 25 | bigger_is_better: bool = False 26 | metric_name: str = "model_size" 27 | can_be_equal: bool = True 28 | 29 | 30 | @dataclass 31 | class ModelSelectorConfig: 32 | _target_: str = "cybulde.evaluation.model_selector.ModelSelector" 33 | mlflow_run_id: Optional[str] = SI("${infrastructure.mlflow.run_id}") 34 | must_be_better_metric_comparers: dict[str, MetricComparerConfig] = field(default_factory=lambda: {}) 35 | to_be_thresholded_metric_comparers: dict[str, MetricComparerConfig] = field(default_factory=lambda: {}) 36 | threshold: float = 0.0 37 | 38 | 39 | @dataclass 40 | class CyberBullyingDetectionModelSelectorConfig(ModelSelectorConfig): 41 | must_be_better_metric_comparers: dict[str, MetricComparerConfig] = field( 42 | default_factory=lambda: { 43 | "f1_score": BinaryF1ScoreMetricComparerConfig(), 44 | "model_size": ModelSizeMetricComparerConfig(), 45 | } 46 | ) 47 | 48 | 49 | def setup_config() -> None: 50 | cs = ConfigStore.instance() 51 | cs.store(name="metric_comparer_schema", group="model_selector/metric_comparers", node=MetricComparerConfig) 52 | cs.store(name="model_selector_schema", group="model_selector", node=ModelSelectorConfig) 53 | 
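For intuition, a hypothetical sketch of the comparison these configs parameterize; the real logic lives in cybulde/evaluation/model_selector.py (not reproduced here) and may differ, but the parameter names below mirror the MetricComparerConfig fields:

def is_better(candidate: float, incumbent: float, bigger_is_better: bool, can_be_equal: bool = False) -> bool:
    # Flip the sign so that "greater" always means "better", whatever the metric's direction.
    if not bigger_is_better:
        candidate, incumbent = -candidate, -incumbent
    return candidate >= incumbent if can_be_equal else candidate > incumbent

# BinaryF1ScoreMetricComparerConfig -> is_better(f1_new, f1_best, bigger_is_better=True)
# ModelSizeMetricComparerConfig -> is_better(size_new, size_best, bigger_is_better=False, can_be_equal=True)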
-------------------------------------------------------------------------------- /cybulde/evaluation/tasks/bases.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import TYPE_CHECKING, Union 3 | 4 | from lightning.pytorch import Trainer 5 | 6 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 7 | from cybulde.evaluation.lightning_modules.bases import EvaluationLightningModule, PartialEvaluationLightningModuleType 8 | from cybulde.models.common.exporter import TarModelLoader 9 | 10 | if TYPE_CHECKING: 11 | from cybulde.config_schemas.config_schema import Config 12 | from cybulde.config_schemas.evaluation.evaluation_task_schemas import EvaluationTaskConfig 13 | 14 | 15 | class EvaluationTask(ABC): 16 | def __init__( 17 | self, 18 | name: str, 19 | data_module: Union[DataModule, PartialDataModuleType], 20 | lightning_module: EvaluationLightningModule, 21 | trainer: Trainer, 22 | ) -> None: 23 | super().__init__() 24 | 25 | self.name = name 26 | self.trainer = trainer 27 | self.lightning_module = lightning_module 28 | self.lightning_module.eval() 29 | 30 | if isinstance(data_module, DataModule): 31 | self.data_module = data_module 32 | else: 33 | self.data_module = data_module(transformation=self.lightning_module.get_transformation()) 34 | 35 | @abstractmethod 36 | def run(self, config: "Config", task_config: "EvaluationTaskConfig") -> None: 37 | ... 38 | 39 | 40 | class TarModelEvaluationTask(EvaluationTask): 41 | def __init__( 42 | self, 43 | name: str, 44 | data_module: Union[DataModule, PartialDataModuleType], 45 | lightning_module: PartialEvaluationLightningModuleType, 46 | trainer: Trainer, 47 | tar_model_path: str, 48 | ) -> None: 49 | model = TarModelLoader(tar_model_path).load() 50 | _lightning_module = lightning_module(model=model) 51 | super().__init__(name=name, data_module=data_module, lightning_module=_lightning_module, trainer=trainer) 52 | -------------------------------------------------------------------------------- /cybulde/config_schemas/data_module_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING, SI 5 | 6 | from cybulde.config_schemas.models import transformation_schemas 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class DataModuleConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | batch_size: int = MISSING 14 | shuffle: bool = False 15 | num_workers: int = 8 16 | pin_memory: bool = True 17 | drop_last: bool = True 18 | persistent_workers: bool = False 19 | 20 | def loggable_params(self) -> list[str]: 21 | return ["_target_", "batch_size"] 22 | 23 | 24 | @dataclass 25 | class TextClassificationDataModuleConfig(DataModuleConfig): 26 | _target_: str = "cybulde.data_modules.data_modules.TextClassificationDataModule" 27 | train_df_path: str = MISSING 28 | dev_df_path: str = MISSING 29 | test_df_path: str = MISSING 30 | transformation: transformation_schemas.TransformationConfig = MISSING 31 | text_column_name: str = "cleaned_text" 32 | label_column_name: str = "label" 33 | 34 | 35 | @dataclass 36 | class ScrappedDataTextClassificationDataModuleConfig(TextClassificationDataModuleConfig): 37 | batch_size: int = 64 38 | train_df_path: str = "gs://emkademy/cybulde/data/processed/rebalanced_splits/train.parquet" 39 | dev_df_path: str = 
"gs://emkademy/cybulde/data/processed/rebalanced_splits/dev.parquet" 40 | test_df_path: str = "gs://emkademy/cybulde/data/processed/rebalanced_splits/test.parquet" 41 | transformation: transformation_schemas.TransformationConfig = SI( 42 | "${..lightning_module.model.backbone.transformation}" 43 | ) 44 | 45 | 46 | def setup_config() -> None: 47 | transformation_schemas.setup_config() 48 | 49 | cs = ConfigStore.instance() 50 | cs.store( 51 | name="text_classification_data_module_schema", 52 | group="tasks/data_module", 53 | node=TextClassificationDataModuleConfig, 54 | ) 55 | -------------------------------------------------------------------------------- /cybulde/config_schemas/experiment/bert/local_bert.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import OmegaConf 6 | 7 | from cybulde.config_schemas.base_schemas import TaskConfig 8 | from cybulde.config_schemas.config_schema import Config 9 | from cybulde.config_schemas.evaluation import model_selector_schemas 10 | from cybulde.config_schemas.evaluation.evaluation_task_schemas import DefaultCommonEvaluationTaskConfig 11 | from cybulde.config_schemas.trainer.trainer_schemas import GPUProd 12 | from cybulde.config_schemas.training.training_task_schemas import DefaultCommonTrainingTaskConfig 13 | 14 | 15 | @dataclass 16 | class LocalBertExperiment(Config): 17 | tasks: dict[str, TaskConfig] = field( 18 | default_factory=lambda: { 19 | "binary_text_classification_task": DefaultCommonTrainingTaskConfig(trainer=GPUProd()), 20 | "binary_text_evaluation_task": DefaultCommonEvaluationTaskConfig(), 21 | } 22 | ) 23 | model_selector: Optional[ 24 | model_selector_schemas.ModelSelectorConfig 25 | ] = model_selector_schemas.CyberBullyingDetectionModelSelectorConfig() 26 | registered_model_name: Optional[str] = "bert_tiny" 27 | 28 | 29 | FinalLocalBertExperiment = OmegaConf.merge( 30 | LocalBertExperiment, 31 | OmegaConf.from_dotlist( 32 | [ 33 | "infrastructure.mlflow.experiment_name=cybulde", 34 | "tasks.binary_text_classification_task.data_module.batch_size=1024", 35 | "tasks.binary_text_evaluation_task.tar_model_path=${tasks.binary_text_classification_task.tar_model_export_path}", 36 | "tasks.binary_text_evaluation_task.data_module=${tasks.binary_text_classification_task.data_module}", 37 | "tasks.binary_text_evaluation_task.trainer=${tasks.binary_text_classification_task.trainer}", 38 | ] 39 | ), 40 | ) 41 | 42 | cs = ConfigStore.instance() 43 | cs.store(name="local_bert", group="experiment/bert", node=FinalLocalBertExperiment, package="_global_") 44 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-bullseye 2 | 3 | ARG USER_ID 4 | ARG USER_NAME 5 | ENV HOME=/home/${USER_NAME} \ 6 | VIRTUAL_ENV=/home/${USER_NAME}/venv 7 | ENV \ 8 | PYTHONUNBUFFERED=1 \ 9 | DEBIAN_FRONTEND=noninteractive \ 10 | TZ=Europe/Warsaw \ 11 | PATH="/usr/local/gcloud/google-cloud-sdk/bin:${HOME}/.local/bin:${VIRTUAL_ENV}/bin:${PATH}" \ 12 | PYTHONPATH="/app:${PYTHONPATH}" \ 13 | BUILD_POETRY_LOCK="${HOME}/poetry.lock.build" 14 | 15 | RUN apt-get -qq update \ 16 | && apt-get -qq -y install vim gcc curl git build-essential libb64-dev software-properties-common \ 17 | && rm -rf /var/lib/apt/lists/* \ 18 | && apt-get -qq -y clean 19 
| 20 | RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-426.0.0-linux-x86_64.tar.gz > /tmp/google-cloud-sdk.tar.gz 21 | RUN mkdir -p /usr/local/gcloud \ 22 | && tar -C /usr/local/gcloud -xf /tmp/google-cloud-sdk.tar.gz \ 23 | && /usr/local/gcloud/google-cloud-sdk/install.sh --usage-reporting false --command-completion true --bash-completion true --path-update true --quiet 24 | 25 | RUN addgroup --system --gid ${USER_ID} ${USER_NAME} \ 26 | && useradd --system -m --no-log-init --home-dir ${HOME} --uid ${USER_ID} --gid ${USER_NAME} --groups ${USER_NAME} ${USER_NAME} 27 | 28 | RUN chown -R ${USER_NAME}:${USER_NAME} ${HOME} 29 | RUN mkdir -p /app /mlflow-artifact-store "${HOME}/.local/share" && chown -R ${USER_NAME}:${USER_NAME} /app /tmp /mlflow-artifact-store "${HOME}/.local" 30 | 31 | RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.7.1 32 | 33 | USER ${USER_NAME} 34 | 35 | COPY pyproject.toml *.lock /app/ 36 | WORKDIR /app 37 | 38 | RUN poetry config virtualenvs.create false \ 39 | && python3.10 -m venv ${VIRTUAL_ENV} \ 40 | && pip install --upgrade pip setuptools \ 41 | && poetry install && cp poetry.lock ${BUILD_POETRY_LOCK} \ 42 | && rm -rf ${HOME}/.cache/* 43 | 44 | USER root 45 | COPY ./docker/scripts/* / 46 | RUN chown -R ${USER_NAME} /*.sh && chmod +x /*.sh 47 | USER ${USER_NAME} 48 | 49 | COPY . /app/ 50 | CMD ["/startup-script.sh"] 51 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | volumes: 4 | postgresql-mlflow-data: 5 | mlflow-artifact-store: 6 | 7 | x-app-template: &APP_TEMPLATE 8 | user: "${USER_ID:-1000}" 9 | hostname: "${HOST_NAME:-emkademy}" 10 | image: cybulde-model 11 | build: 12 | context: . 
13 | dockerfile: ./docker/Dockerfile 14 | args: 15 | USER_NAME: "${USER_NAME:-kyuksel}" 16 | USER_ID: "${USER_ID:-1000}" 17 | env_file: 18 | - .envs/.postgres 19 | - .envs/.mlflow-common 20 | - .envs/.mlflow-dev 21 | volumes: 22 | - ./:/app/ 23 | - mlflow-artifact-store:/mlflow-artifact-store 24 | - ~/.config/gcloud/:/home/${USER_NAME}/.config/gcloud 25 | ipc: host 26 | init: true 27 | 28 | services: 29 | mlflow-db: 30 | container_name: mlflow-backend-store 31 | image: postgres:14 32 | env_file: 33 | - .envs/.mlflow-common 34 | - .envs/.mlflow-dev 35 | - .envs/.postgres 36 | volumes: 37 | - postgresql-mlflow-data:/var/lib/postgresql/data 38 | ports: 39 | - 5433:5432 40 | profiles: 41 | - dev 42 | 43 | app-dev: 44 | <<: *APP_TEMPLATE 45 | container_name: cybulde-model-dev-container 46 | ports: 47 | - ${LOCAL_DEV_MLFLOW_SERVER_PORT}:${LOCAL_DEV_MLFLOW_SERVER_PORT} 48 | - 8080:8080 49 | - 8888:8888 50 | - 8001:8001 51 | deploy: 52 | resources: 53 | reservations: 54 | devices: 55 | - driver: nvidia 56 | count: all 57 | capabilities: [gpu] 58 | profiles: 59 | - dev 60 | 61 | app-prod: 62 | <<: *APP_TEMPLATE 63 | container_name: cybulde-model-prod-container 64 | env_file: 65 | - .envs/.mlflow-common 66 | - .envs/.mlflow-prod 67 | - .envs/.infrastructure 68 | ports: 69 | - ${PROD_MLFLOW_SERVER_PORT}:${PROD_MLFLOW_SERVER_PORT} 70 | profiles: 71 | - prod 72 | 73 | app-ci: 74 | <<: *APP_TEMPLATE 75 | container_name: cybulde-model-ci-container 76 | ports: 77 | - ${LOCAL_DEV_MLFLOW_SERVER_PORT}:${LOCAL_DEV_MLFLOW_SERVER_PORT} 78 | profiles: 79 | - ci 80 | -------------------------------------------------------------------------------- /cybulde/evaluation/tasks/common_evaluation_task.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | from hydra.utils import instantiate 4 | from lightning.pytorch import Trainer 5 | 6 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 7 | from cybulde.evaluation.lightning_modules.bases import PartialEvaluationLightningModuleType 8 | from cybulde.evaluation.tasks.bases import TarModelEvaluationTask 9 | from cybulde.utils.mlflow_utils import activate_mlflow, log_model 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | from cybulde.config_schemas.evaluation.evaluation_task_schemas import EvaluationTaskConfig 14 | 15 | 16 | class CommonEvaluationTask(TarModelEvaluationTask): 17 | def __init__( 18 | self, 19 | name: str, 20 | data_module: Union[DataModule, PartialDataModuleType], 21 | lightning_module: PartialEvaluationLightningModuleType, 22 | trainer: Trainer, 23 | tar_model_path: str, 24 | ) -> None: 25 | super().__init__( 26 | name=name, 27 | data_module=data_module, 28 | lightning_module=lightning_module, 29 | trainer=trainer, 30 | tar_model_path=tar_model_path, 31 | ) 32 | 33 | def run(self, config: "Config", task_config: "EvaluationTaskConfig") -> None: 34 | experiment_name = config.infrastructure.mlflow.experiment_name 35 | run_id = config.infrastructure.mlflow.run_id 36 | run_name = config.infrastructure.mlflow.run_name 37 | 38 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 39 | self.trainer.test(model=self.lightning_module, datamodule=self.data_module) 40 | 41 | model_selector = instantiate(config.model_selector) 42 | assert config.registered_model_name is not None 43 | if model_selector is not None: 44 | if model_selector.is_selected(): 45 | log_model( 46 | 
config.infrastructure.mlflow, model_selector.get_new_best_run_tag(), config.registered_model_name 47 | ) 48 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/training_task_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING, SI 5 | 6 | from cybulde.config_schemas import data_module_schemas 7 | from cybulde.config_schemas.base_schemas import TaskConfig 8 | from cybulde.config_schemas.trainer import trainer_schemas 9 | from cybulde.config_schemas.training import training_lightning_module_schemas 10 | 11 | 12 | @dataclass 13 | class TrainingTaskConfig(TaskConfig): 14 | best_training_checkpoint: str = SI("${infrastructure.mlflow.artifact_uri}/best-checkpoints/last.ckpt") 15 | last_training_checkpoint: str = SI("${infrastructure.mlflow.artifact_uri}/last-checkpoints/last.ckpt") 16 | 17 | 18 | @dataclass 19 | class TarModelExportingTrainingTaskConfig(TrainingTaskConfig): 20 | tar_model_export_path: str = SI("${infrastructure.mlflow.artifact_uri}/exported_model.tar.gz") 21 | 22 | 23 | @dataclass 24 | class CommonTrainingTaskConfig(TrainingTaskConfig): 25 | _target_: str = "cybulde.training.tasks.common_training_task.CommonTrainingTask" 26 | 27 | 28 | @dataclass 29 | class DefaultCommonTrainingTaskConfig(TarModelExportingTrainingTaskConfig): 30 | _target_: str = "cybulde.training.tasks.tar_model_exporting_training_task.TarModelExportingTrainingTask" 31 | name: str = "binary_text_classification_task" 32 | data_module: data_module_schemas.DataModuleConfig = ( 33 | data_module_schemas.ScrappedDataTextClassificationDataModuleConfig() 34 | ) 35 | lightning_module: training_lightning_module_schemas.TrainingLightningModuleConfig = ( 36 | training_lightning_module_schemas.CybuldeBinaryTextClassificationTrainingLightningModuleConfig() 37 | ) 38 | trainer: trainer_schemas.TrainerConfig = trainer_schemas.GPUDev() 39 | 40 | 41 | def setup_config() -> None: 42 | data_module_schemas.setup_config() 43 | training_lightning_module_schemas.setup_config() 44 | trainer_schemas.setup_config() 45 | 46 | cs = ConfigStore.instance() 47 | cs.store( 48 | name="common_training_task_schema", 49 | group="tasks", 50 | node=CommonTrainingTaskConfig, 51 | ) 52 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/training_lightning_module_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.config_schemas.base_schemas import LightningModuleConfig 8 | from cybulde.config_schemas.models.model_schemas import BertTinyBinaryTextClassificationModelConfig, ModelConfig 9 | from cybulde.config_schemas.training import loss_schemas, optimizer_schemas, scheduler_schemas 10 | from cybulde.utils.mixins import LoggableParamsMixin 11 | 12 | 13 | @dataclass 14 | class TrainingLightningModuleConfig(LightningModuleConfig, LoggableParamsMixin): 15 | _target_: str = MISSING 16 | model: ModelConfig = MISSING 17 | loss: loss_schemas.LossFunctionConfig = MISSING 18 | optimizer: optimizer_schemas.OptimizerConfig = MISSING 19 | scheduler: Optional[scheduler_schemas.LightningSchedulerConfig] = None 20 | 21 | def loggable_params(self) -> list[str]:
22 | return ["_target_"] 23 | 24 | 25 | @dataclass 26 | class BinaryTextClassificationTrainingLightningModuleConfig(TrainingLightningModuleConfig): 27 | _target_: str = ( 28 | "cybulde.training.lightning_modules.binary_text_classification.BinaryTextClassificationTrainingLightningModule" 29 | ) 30 | 31 | 32 | @dataclass 33 | class CybuldeBinaryTextClassificationTrainingLightningModuleConfig( 34 | BinaryTextClassificationTrainingLightningModuleConfig 35 | ): 36 | model: ModelConfig = BertTinyBinaryTextClassificationModelConfig() 37 | loss: loss_schemas.LossFunctionConfig = loss_schemas.BCEWithLogitsLossConfig() 38 | optimizer: optimizer_schemas.OptimizerConfig = optimizer_schemas.AdamWOptimizerConfig() 39 | scheduler: Optional[ 40 | scheduler_schemas.LightningSchedulerConfig 41 | ] = scheduler_schemas.ReduceLROnPlateauLightningSchedulerConfig() 42 | 43 | 44 | def setup_config() -> None: 45 | cs = ConfigStore.instance() 46 | cs.store( 47 | name="binary_text_classification_training_lightning_module_schema", 48 | group="tasks/lightning_module", 49 | node=BinaryTextClassificationTrainingLightningModuleConfig, 50 | ) 51 | -------------------------------------------------------------------------------- /cybulde/evaluation/lightning_modules/binary_text_evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import mlflow 4 | import torch 5 | 6 | from torch import Tensor 7 | from torchmetrics.classification import BinaryAccuracy, BinaryConfusionMatrix, BinaryF1Score 8 | from transformers import BatchEncoding 9 | 10 | from cybulde.evaluation.lightning_modules.bases import EvaluationLightningModule 11 | from cybulde.models.models import Model 12 | from cybulde.models.transformations import Transformation 13 | from cybulde.utils.torch_utils import plot_confusion_matrix 14 | 15 | 16 | class BinaryTextEvaluationLightningModule(EvaluationLightningModule): 17 | def __init__( 18 | self, 19 | model: Model, 20 | ) -> None: 21 | super().__init__(model=model) 22 | 23 | self.test_accuracy = BinaryAccuracy() 24 | self.test_f1_score = BinaryF1Score() 25 | self.test_confusion_matrix = BinaryConfusionMatrix() 26 | 27 | self.test_step_outputs: dict[str, list[Tensor]] = defaultdict(list) 28 | 29 | def forward(self, texts: BatchEncoding) -> Tensor: 30 | output: Tensor = self.model(texts) 31 | return output 32 | 33 | def test_step(self, batch: tuple[BatchEncoding, Tensor], batch_idx: int) -> None: # type: ignore 34 | texts, labels = batch 35 | logits = self(texts) 36 | 37 | self.test_accuracy(logits, labels) 38 | self.test_f1_score(logits, labels) 39 | self.test_confusion_matrix(logits, labels) 40 | 41 | self.log("test_accuracy", self.test_accuracy, on_step=False, on_epoch=True) 42 | self.log("test_f1_score", self.test_f1_score, on_step=False, on_epoch=True) 43 | 44 | self.test_step_outputs["logits"].append(logits) 45 | self.test_step_outputs["labels"].append(labels) 46 | 47 | def on_test_epoch_end(self) -> None: 48 | all_logits = torch.cat(self.test_step_outputs["logits"])  # cat, not stack: per-batch tensors may differ in size 49 | all_labels = torch.cat(self.test_step_outputs["labels"]) 50 | 51 | confusion_matrix = self.test_confusion_matrix(all_logits, all_labels) 52 | figure = plot_confusion_matrix(confusion_matrix, ["0", "1"]) 53 | mlflow.log_figure(figure, "test_confusion_matrix.png") 54 | 55 | self.test_step_outputs = defaultdict(list) 56 | 57 | def get_transformation(self) -> Transformation: 58 | return self.model.get_transformation() 59 |
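To see the metric-accumulation pattern above in isolation, here is a small self-contained sketch with synthetic logits and labels (no MLflow, no Lightning). torchmetrics treats float predictions outside [0, 1] as logits, applying a sigmoid and a 0.5 threshold internally, so raw model outputs can be passed straight in:

import torch
from torchmetrics.classification import BinaryConfusionMatrix, BinaryF1Score

f1 = BinaryF1Score()
confusion_matrix = BinaryConfusionMatrix()

# Two "batches" of raw logits with their binary labels; sizes differ on purpose.
batches = [
    (torch.tensor([2.0, -1.0, 0.5]), torch.tensor([1, 0, 1])),
    (torch.tensor([-0.3, 1.7]), torch.tensor([0, 1])),
]

for logits, labels in batches:
    f1.update(logits, labels)
    confusion_matrix.update(logits, labels)

print(f1.compute())                # F1 aggregated over all batches
print(confusion_matrix.compute())  # 2x2 matrix, the input to plot_confusion_matrix

Because batch sizes can differ (the last batch is usually smaller), the per-batch outputs collected in test_step_outputs are concatenated with torch.cat, which keeps the flat (num_samples,) shape the metrics expect.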
-------------------------------------------------------------------------------- /cybulde/config_schemas/infrastructure/instance_template_creator_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Optional 3 | 4 | from omegaconf import SI 5 | 6 | from cybulde.infrastructure.instance_template_creator import VMType 7 | 8 | 9 | @dataclass 10 | class BootDiskConfig: 11 | project_id: str = "deeplearning-platform-release" 12 | name: str = "common-cu113-v20230925" 13 | size_gb: int = 50 14 | labels: Any = SI("${..labels}") 15 | 16 | 17 | @dataclass 18 | class VMConfig: 19 | machine_type: str = "n1-standard-8" 20 | accelerator_count: int = 1 21 | accelerator_type: str = "nvidia-tesla-t4" 22 | vm_type: VMType = VMType.STANDARD 23 | disks: list[str] = field(default_factory=lambda: []) 24 | 25 | 26 | @dataclass 27 | class VMMetadataConfig: 28 | instance_group_name: str = SI("${infrastructure.instance_group_creator.name}") 29 | docker_image: Optional[str] = SI("${docker_image}") 30 | zone: str = SI("${infrastructure.zone}") 31 | python_hash_seed: int = 42 32 | mlflow_tracking_uri: str = SI("${infrastructure.mlflow.mlflow_internal_tracking_uri}") 33 | node_count: int = 1 34 | disks: Any = SI("${..vm_config.disks}") 35 | etcd_ip: Optional[str] = SI("${infrastructure.etcd_ip}") 36 | 37 | 38 | @dataclass 39 | class InstanceTemplateCreatorConfig: 40 | _target_: str = "cybulde.infrastructure.instance_template_creator.InstanceTemplateCreator" 41 | scopes: list[str] = field( 42 | default_factory=lambda: [ 43 | "https://www.googleapis.com/auth/cloud-platform", 44 | "https://www.googleapis.com/auth/cloud.useraccounts.readonly", 45 | "https://www.googleapis.com/auth/cloudruntimeconfig", 46 | ] 47 | ) 48 | network: str = "https://www.googleapis.com/compute/v1/projects/cybulde/global/networks/default" 49 | subnetwork: str = "https://www.googleapis.com/compute/v1/projects/cybulde/regions/europe-west4/subnetworks/default" 50 | startup_script_path: str = "scripts/vm_startup/task_runner_startup_script.sh" 51 | vm_config: VMConfig = VMConfig() 52 | boot_disk_config: BootDiskConfig = BootDiskConfig() 53 | vm_metadata_config: VMMetadataConfig = VMMetadataConfig() 54 | template_name: str = SI("${infrastructure.instance_group_creator.name}") 55 | project_id: str = SI("${infrastructure.project_id}") 56 | labels: dict[str, str] = field(default_factory=lambda: {"project": "cybulde"}) 57 | -------------------------------------------------------------------------------- /cybulde/training/tasks/common_training_task.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | from lightning.pytorch import Trainer 4 | 5 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 6 | from cybulde.training.lightning_modules.bases import TrainingLightningModule 7 | from cybulde.training.tasks.bases import TrainingTask 8 | from cybulde.utils.io_utils import is_file 9 | from cybulde.utils.mlflow_utils import activate_mlflow, log_artifacts_for_reproducibility 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | from cybulde.config_schemas.training.training_task_schemas import TrainingTaskConfig 14 | 15 | 16 | class CommonTrainingTask(TrainingTask): 17 | def __init__( 18 | self, 19 | name: str, 20 | data_module: Union[DataModule, PartialDataModuleType], 21 | lightning_module: 
TrainingLightningModule, 22 | trainer: Trainer, 23 | best_training_checkpoint: str, 24 | last_training_checkpoint: str, 25 | ) -> None: 26 | super().__init__( 27 | name=name, 28 | data_module=data_module, 29 | lightning_module=lightning_module, 30 | trainer=trainer, 31 | best_training_checkpoint=best_training_checkpoint, 32 | last_training_checkpoint=last_training_checkpoint, 33 | ) 34 | 35 | def run(self, config: "Config", task_config: "TrainingTaskConfig") -> None: 36 | experiment_name = config.infrastructure.mlflow.experiment_name 37 | run_id = config.infrastructure.mlflow.run_id 38 | run_name = config.infrastructure.mlflow.run_name 39 | 40 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 41 | if self.trainer.is_global_zero: 42 | log_artifacts_for_reproducibility() 43 | 44 | assert isinstance(self.data_module, DataModule) 45 | if is_file(self.last_training_checkpoint): 46 | self.logger.info(f"Found checkpoint here: {self.last_training_checkpoint}. Resuming training...") 47 | self.trainer.fit( 48 | model=self.lightning_module, datamodule=self.data_module, ckpt_path=self.last_training_checkpoint 49 | ) 50 | else: 51 | self.trainer.fit(model=self.lightning_module, datamodule=self.data_module) 52 | 53 | self.logger.info("Training finished...") 54 | -------------------------------------------------------------------------------- /cybulde/config_schemas/trainer/callbacks_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING, SI 6 | 7 | 8 | @dataclass 9 | class CallbackConfig: 10 | _target_: str = MISSING 11 | 12 | 13 | @dataclass 14 | class ModelCheckpointConfig(CallbackConfig): 15 | _target_: str = "lightning.pytorch.callbacks.ModelCheckpoint" 16 | dirpath: Optional[str] = "./data/pytorch-lightning" 17 | filename: Optional[str] = None 18 | monitor: Optional[str] = None 19 | verbose: bool = False 20 | save_last: Optional[bool] = None 21 | save_top_k: int = 1 22 | mode: str = "min" 23 | auto_insert_metric_name: bool = False 24 | save_weights_only: bool = False 25 | every_n_train_steps: Optional[int] = None 26 | train_time_interval: Optional[str] = None 27 | every_n_epochs: Optional[int] = None 28 | save_on_train_epoch_end: Optional[bool] = None 29 | 30 | 31 | @dataclass 32 | class BestModelCheckpointConfig(ModelCheckpointConfig): 33 | dirpath: Optional[str] = SI("${infrastructure.mlflow.artifact_uri}/best-checkpoints/") 34 | monitor: str = MISSING 35 | save_last: Optional[bool] = True 36 | save_top_k: int = 2 37 | mode: str = MISSING 38 | 39 | 40 | @dataclass 41 | class ValidationF1ScoreBestModelCheckpointConfig(BestModelCheckpointConfig): 42 | monitor: str = "validation_f1_score" 43 | mode: str = "max" 44 | 45 | 46 | @dataclass 47 | class LastModelCheckpointConfig(ModelCheckpointConfig): 48 | dirpath: Optional[str] = SI("${infrastructure.mlflow.artifact_uri}/last-checkpoints/") 49 | every_n_train_steps: int = SI("${save_last_checkpoint_every_n_train_steps}") 50 | save_last: Optional[bool] = True 51 | filename: Optional[str] = "checkpoint-{epoch}" 52 | save_top_k: int = -1 53 | 54 | 55 | @dataclass 56 | class LearningRateMonitorConfig(CallbackConfig): 57 | _target_: str = "lightning.pytorch.callbacks.LearningRateMonitor" 58 | logging_interval: str = "step" 59 | 60 | 61 | def setup_config() -> None: 62 | cs = ConfigStore.instance() 63 | 64 |
cs.store(name="best_model_checkpoint_schema", group="tasks/trainer/callbacks", node=BestModelCheckpointConfig) 65 | cs.store(name="last_model_checkpoint_schema", group="tasks/trainer/callbacks", node=LastModelCheckpointConfig) 66 | cs.store(name="learning_rate_monitor_schema", group="tasks/trainer/callbacks", node=LearningRateMonitorConfig) 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | .envs/.app 113 | .envs/.dvc 114 | .envs/.mlflow 115 | .envs/.postgres 116 | .envs/.triton 117 | .envs/.secrets 118 | .envs/.gcp 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /cybulde/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from typing import Any 4 | 5 | import yaml 6 | 7 | from fsspec import AbstractFileSystem, filesystem 8 | 9 | GCS_PREFIX = "gs://" 10 | GCS_FILE_SYSTEM_NAME = "gcs" 11 | LOCAL_FILE_SYSTEM_NAME = "file" 12 | TMP_FILE_PATH = "/tmp/" 13 | 14 | 15 | def choose_file_system(path: str) -> AbstractFileSystem: 16 | return filesystem(GCS_FILE_SYSTEM_NAME) if path.startswith(GCS_PREFIX) else filesystem(LOCAL_FILE_SYSTEM_NAME) 17 | 18 | 19 | def open_file(path: str, mode: str = "r") -> Any: 20 | file_system = choose_file_system(path) 21 | return file_system.open(path, mode) 22 | 23 | 24 | def write_yaml_file(yaml_file_path: str, yaml_file_content: dict[Any, Any]) -> None: 25 | with open_file(yaml_file_path, "w") as yaml_file: 26 | yaml.dump(yaml_file_content, yaml_file) 27 | 28 | 29 | def is_dir(path: str) -> bool: 30 | file_system = choose_file_system(path) 31 | is_dir: bool = file_system.isdir(path) 32 | return is_dir 33 | 34 | 35 | def is_file(path: str) -> bool: 36 | file_system = choose_file_system(path) 37 | is_file: bool = file_system.isfile(path) 38 | return is_file 39 | 40 | 41 | def make_dirs(path: str) -> None: 42 | file_system = choose_file_system(path) 43 | file_system.makedirs(path, exist_ok=True) 44 | 45 | 46 | def list_paths(path: str) -> list[str]: 47 | file_system = choose_file_system(path) 48 | if not is_dir(path): 49 | return [] 50 | paths: list[str] = file_system.ls(path) 51 | if GCS_FILE_SYSTEM_NAME in file_system.protocol: 52 | gs_paths: list[str] = [f"{GCS_PREFIX}{path}" for path in paths] 53 | return gs_paths 54 | return paths 55 | 56 | 57 | def copy_dir(source_dir: str, target_dir: str) -> None: 58 | if not is_dir(target_dir): 59 | make_dirs(target_dir) 60 | source_files = list_paths(source_dir) 61 | for source_file in source_files: 62 | target_file = os.path.join(target_dir, os.path.basename(source_file)) 63 | if is_file(source_file): 64 | with open_file(source_file, mode="rb") as source, open_file(target_file, mode="wb") as target: 65 | content = source.read() 66 | target.write(content) 67 | else: 68 | raise ValueError(f"Source file {source_file} is not a file.") 69 | 70 | 71 | def translate_gcs_dir_to_local(path: str) -> str: 72 | if path.startswith(GCS_PREFIX): 73 | path = path.rstrip("/") 74 | local_path = os.path.join(TMP_FILE_PATH, os.path.split(path)[-1]) 75 | os.makedirs(local_path, exist_ok=True) 76 | copy_dir(path, local_path) 77 | return local_path 78 | return path 79 | -------------------------------------------------------------------------------- /scripts/vm_startup/task_runner_startup_script.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | export NCCL_ASYNC_ERROR_HANDLING=1 6 | export GCP_LOGGING_ENABLED="TRUE" 7 | 8 | INSTANCE_GROUP_NAME=$(curl --silent --fail http://metadata.google.internal/computeMetadata/v1/instance/attributes/instance_group_name -H "Metadata-Flavor: Google") 9 | DOCKER_IMAGE=$(curl --silent --fail http://metadata.google.internal/computeMetadata/v1/instance/attributes/docker_image -H "Metadata-Flavor: Google") 10 | ZONE=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/zone -H "Metadata-Flavor: Google") 11 | PYTHON_HASH_SEED=$(curl --silent --fail http://metadata.google.internal/computeMetadata/v1/instance/attributes/python_hash_seed -H "Metadata-Flavor: Google" || echo "42") 12 | MLFLOW_TRACKING_URI=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/mlflow_tracking_uri -H "Metadata-Flavor: Google") 13 | NODE_COUNT=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/node_count -H "Metadata-Flavor: Google") 14 | DISKS=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/disks -H "Metadata-Flavor: Google") 15 | ETCD_IP=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/etcd_ip -H "Metadata-Flavor: Google") 16 | 17 | INSTANCE_GROUP_NAME=$(echo ${INSTANCE_GROUP_NAME} | tr '[:upper:]' '[:lower:]') 18 | 19 | echo -e "TRAINING: instance group name: ${INSTANCE_GROUP_NAME}, docker image: ${DOCKER_IMAGE}, node count: ${NODE_COUNT}, python hash seed: ${PYTHON_HASH_SEED}" 20 | 21 | echo "============= Installing Nvidia Drivers ===============" 22 | apt-get update && /opt/deeplearning/install-driver.sh 23 | 24 | echo "============= Downloading docker image ===============" 25 | gcloud auth configure-docker --quiet europe-west4-docker.pkg.dev 26 | time docker pull "${DOCKER_IMAGE}" 27 | 28 | echo "============= TRAINING: start ===============" 29 | 30 | if [ "${ETCD_IP}" = "None" ]; then 31 | docker run --init --rm --gpus all --ipc host --user root --hostname "$(hostname)" --privileged \ 32 | --log-driver=gcplogs \ 33 | -e PYTHONHASHSEED="${PYTHON_HASH_SEED}" \ 34 | -e MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI}" \ 35 | -e TOKENIZERS_PARALLELISM=false \ 36 | ${DOCKER_IMAGE} \ 37 | torchrun \ 38 | --nnodes="${NODE_COUNT}" \ 39 | --nproc_per_node='gpu' \ 40 | cybulde/run_tasks.py || echo '================ TRAINING: job failed ===============' 41 | else 42 | docker run --init --rm --gpus all --ipc host --user root --hostname "$(hostname)" --privileged \ 43 | --log-driver=gcplogs \ 44 | -e PYTHONHASHSEED="${PYTHON_HASH_SEED}" \ 45 | -e MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI}" \ 46 | -e TOKENIZERS_PARALLELISM=false \ 47 | ${DOCKER_IMAGE} \ 48 | torchrun \ 49 | --nnodes="${NODE_COUNT}" \ 50 | --nproc_per_node='gpu' \ 51 | --rdzv_id="${INSTANCE_GROUP_NAME}" \ 52 | --rdzv_backend=etcd-v2 \ 53 | --rdzv_endpoint="${ETCD_IP}" \ 54 | cybulde/run_tasks.py || echo '================ TRAINING: job failed ===============' 55 | 56 | fi 57 | 58 | echo "============= Cleaning up ===============" 59 | gcloud compute instance-groups managed delete --quiet "${INSTANCE_GROUP_NAME}" --zone "${ZONE}" 60 | -------------------------------------------------------------------------------- /cybulde/training/tasks/tar_model_exporting_training_task.py: -------------------------------------------------------------------------------- 
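The run() method in the listing below derives the BCEWithLogitsLoss pos_weight from the label balance of the training split before fitting. A tiny worked example of that arithmetic, with made-up counts:

import pandas as pd
from torch import Tensor

train_df = pd.DataFrame({"label": [0, 0, 0, 1]})  # 3 negatives, 1 positive (synthetic)
value_counts = train_df["label"].value_counts()   # label 0 -> 3, label 1 -> 1
pos_weight = value_counts[0] / value_counts[1]    # 3.0: each positive weighs as much as 3 negatives
print(Tensor([pos_weight]))                       # tensor([3.]) is handed to set_pos_weight()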
1 | from typing import TYPE_CHECKING, Union 2 | 3 | import pandas as pd 4 | 5 | from lightning.pytorch import Trainer 6 | from torch import Tensor 7 | 8 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 9 | from cybulde.models.common.exporter import TarModelExporter 10 | from cybulde.training.lightning_modules.bases import ModelStateDictExportingTrainingLightningModule 11 | from cybulde.training.tasks.bases import TrainingTask 12 | from cybulde.utils.io_utils import is_file 13 | from cybulde.utils.mlflow_utils import activate_mlflow, log_artifacts_for_reproducibility 14 | 15 | if TYPE_CHECKING: 16 | from cybulde.config_schemas.config_schema import Config 17 | from cybulde.config_schemas.training.training_task_schemas import TrainingTaskConfig 18 | 19 | 20 | class TarModelExportingTrainingTask(TrainingTask): 21 | def __init__( 22 | self, 23 | name: str, 24 | data_module: Union[DataModule, PartialDataModuleType], 25 | lightning_module: ModelStateDictExportingTrainingLightningModule, 26 | trainer: Trainer, 27 | best_training_checkpoint: str, 28 | last_training_checkpoint: str, 29 | tar_model_export_path: str, 30 | ) -> None: 31 | super().__init__( 32 | name=name, 33 | data_module=data_module, 34 | lightning_module=lightning_module, 35 | trainer=trainer, 36 | best_training_checkpoint=best_training_checkpoint, 37 | last_training_checkpoint=last_training_checkpoint, 38 | ) 39 | 40 | self.tar_model_export_path = tar_model_export_path 41 | 42 | def run(self, config: "Config", task_config: "TrainingTaskConfig") -> None: 43 | experiment_name = config.infrastructure.mlflow.experiment_name 44 | run_id = config.infrastructure.mlflow.run_id 45 | run_name = config.infrastructure.mlflow.run_name 46 | 47 | train_df = pd.read_parquet(task_config.data_module.train_df_path) 48 | value_counts = train_df["label"].value_counts() 49 | pos_weight = value_counts[0] / value_counts[1] 50 | 51 | self.lightning_module.set_pos_weight(Tensor([pos_weight])) 52 | 53 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 54 | if self.trainer.is_global_zero: 55 | log_artifacts_for_reproducibility() 56 | 57 | assert isinstance(self.data_module, DataModule) 58 | if is_file(self.last_training_checkpoint): 59 | self.logger.info(f"Found checkpoint here: {self.last_training_checkpoint}. Resuming training...") 60 | self.trainer.fit( 61 | model=self.lightning_module, datamodule=self.data_module, ckpt_path=self.last_training_checkpoint 62 | ) 63 | else: 64 | self.trainer.fit(model=self.lightning_module, datamodule=self.data_module) 65 | 66 | self.logger.info("Training finished.
Exporting model state dict...") 67 | 68 | model_state_dict_path = self.lightning_module.export_model_state_dict(self.best_training_checkpoint) # type: ignore 69 | 70 | model_config = task_config.lightning_module.model # type: ignore 71 | model_exporter = TarModelExporter(model_state_dict_path, model_config, self.tar_model_export_path) 72 | model_exporter.export() 73 | -------------------------------------------------------------------------------- /cybulde/infrastructure/instance_group_creator.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from google.cloud import compute_v1 4 | 5 | from cybulde.infrastructure.instance_template_creator import InstanceTemplateCreator 6 | from cybulde.utils.gcp_utils import wait_for_extended_operation 7 | from cybulde.utils.utils import get_logger 8 | 9 | 10 | class InstanceGroupCreator: 11 | def __init__( 12 | self, 13 | instance_template_creator: InstanceTemplateCreator, 14 | name: str, 15 | node_count: int, 16 | project_id: str, 17 | zone: str, 18 | ): 19 | self.logger = get_logger(self.__class__.__name__) 20 | self.instance_template_creator = instance_template_creator 21 | self.name = name.lower() 22 | self.node_count = node_count 23 | self.project_id = project_id 24 | self.zone = zone 25 | 26 | def launch_instance_group(self) -> list[int]: 27 | instance_group = self._create_instance_group() 28 | self.logger.debug(f"{instance_group=}") 29 | 30 | instance_ids = self._get_instance_ids(self.name, self.node_count) 31 | return instance_ids 32 | 33 | def _create_instance_group(self) -> compute_v1.InstanceGroupManager: 34 | self.logger.info("Starting to create instance group...") 35 | instance_template = self.instance_template_creator.create_template() 36 | 37 | name = self.name 38 | instance_group_manager_resource = compute_v1.InstanceGroupManager( 39 | name=name, 40 | base_instance_name=self.name, 41 | instance_template=instance_template.self_link, 42 | target_size=self.node_count, 43 | ) 44 | 45 | project_id = self.project_id 46 | zone = self.zone 47 | 48 | instance_group_managers_client = compute_v1.InstanceGroupManagersClient() 49 | operation = instance_group_managers_client.insert( 50 | project=project_id, instance_group_manager_resource=instance_group_manager_resource, zone=zone 51 | ) 52 | wait_for_extended_operation(operation, "managed instance group creation") 53 | 54 | self.logger.info("Instance group has been created...") 55 | return instance_group_managers_client.get(project=project_id, instance_group_manager=name, zone=zone) 56 | 57 | def _get_instance_ids(self, name: str, node_count: int) -> list[int]: 58 | instance_ids = set() 59 | trial = 0 60 | max_trials = 10 61 | base_sleep_time = 1.5 62 | while trial <= max_trials: 63 | self.logger.info(f"Waiting for instances ({trial=})...") 64 | pager = self.list_instances_in_group(name) 65 | for instance in pager: 66 | if instance.id: 67 | self.logger.info(f"Instance {instance.id} ready") 68 | instance_ids.add(instance.id) 69 | 70 | if len(instance_ids) >= node_count: 71 | break 72 | 73 | time.sleep(pow(base_sleep_time, trial)) 74 | trial += 1 75 | return list(instance_ids) 76 | 77 | def list_instances_in_group( 78 | self, name: str 79 | ) -> compute_v1.services.instance_group_managers.pagers.ListManagedInstancesPager: 80 | instance_group_managers_client = compute_v1.InstanceGroupManagersClient() 81 | pager = instance_group_managers_client.list_managed_instances( 82 | project=self.project_id, instance_group_manager=name, zone=self.zone 83 | ) 
84 | return pager 85 | -------------------------------------------------------------------------------- /cybulde/models/common/io_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from typing import Any 4 | 5 | import yaml 6 | 7 | from fsspec import AbstractFileSystem, filesystem 8 | 9 | GCS_PREFIX = "gs://" 10 | GCS_FILE_SYSTEM_NAME = "gcs" 11 | LOCAL_FILE_SYSTEM_NAME = "file" 12 | TMP_FILE_PATH = "/tmp/" 13 | 14 | 15 | def choose_file_system(path: str) -> AbstractFileSystem: 16 | return filesystem(GCS_FILE_SYSTEM_NAME) if path.startswith(GCS_PREFIX) else filesystem(LOCAL_FILE_SYSTEM_NAME) 17 | 18 | 19 | def open_file(path: str, mode: str = "r") -> Any: 20 | file_system = choose_file_system(path) 21 | return file_system.open(path, mode) 22 | 23 | 24 | def write_yaml_file(yaml_file_path: str, yaml_file_content: dict[Any, Any]) -> None: 25 | with open_file(yaml_file_path, "w") as yaml_file: 26 | yaml.dump(yaml_file_content, yaml_file) 27 | 28 | 29 | def is_dir(path: str) -> bool: 30 | file_system = choose_file_system(path) 31 | is_dir: bool = file_system.isdir(path) 32 | return is_dir 33 | 34 | 35 | def is_file(path: str) -> bool: 36 | file_system = choose_file_system(path) 37 | is_file: bool = file_system.isfile(path) 38 | return is_file 39 | 40 | 41 | def make_dirs(path: str) -> None: 42 | file_system = choose_file_system(path) 43 | file_system.makedirs(path, exist_ok=True) 44 | 45 | 46 | def list_paths(path: str) -> list[str]: 47 | file_system = choose_file_system(path) 48 | if not is_dir(path): 49 | return [] 50 | paths: list[str] = file_system.ls(path) 51 | if GCS_FILE_SYSTEM_NAME in file_system.protocol: 52 | gs_paths: list[str] = [f"{GCS_PREFIX}{path}" for path in paths] 53 | return gs_paths 54 | return paths 55 | 56 | 57 | def copy_file(source_file: str, target_path: str) -> None: 58 | with open_file(source_file, mode="rb") as source, open_file(target_path, "wb") as target: 59 | content = source.read() 60 | target.write(content) 61 | 62 | 63 | def copy_dir(source_dir: str, target_dir: str) -> None: 64 | if not is_dir(target_dir): 65 | make_dirs(target_dir) 66 | source_files = list_paths(source_dir) 67 | for source_file in source_files: 68 | target_file = os.path.join(target_dir, os.path.basename(source_file)) 69 | if is_file(source_file): 70 | with open_file(source_file, mode="rb") as source, open_file(target_file, mode="wb") as target: 71 | content = source.read() 72 | target.write(content) 73 | else: 74 | raise ValueError(f"Source file {source_file} is not a file.") 75 | 76 | 77 | def convert_gcs_path_to_local_path(path: str) -> str: 78 | if path.startswith(GCS_PREFIX): 79 | path = path.rstrip("/") 80 | local_path = os.path.join(TMP_FILE_PATH, os.path.split(path)[-1]) 81 | return local_path 82 | return path 83 | 84 | 85 | def cache_gcs_resource_locally(path: str) -> str: 86 | if path.startswith(GCS_PREFIX): 87 | local_path = convert_gcs_path_to_local_path(path) 88 | 89 | if os.path.exists(local_path): 90 | return local_path 91 | 92 | if is_dir(path): 93 | os.makedirs(local_path, exist_ok=True) 94 | copy_dir(path, local_path) 95 | else: 96 | copy_file(path, local_path) 97 | return local_path 98 | 99 | return path 100 | 101 | 102 | def translate_gcs_dir_to_local(path: str) -> str: 103 | if path.startswith(GCS_PREFIX): 104 | path = path.rstrip("/") 105 | local_path = os.path.join(TMP_FILE_PATH, os.path.split(path)[-1]) 106 | os.makedirs(local_path, exist_ok=True) 107 | copy_dir(path, local_path) 108 | return local_path 
109 | return path 110 | -------------------------------------------------------------------------------- /cybulde/config_schemas/trainer/trainer_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | 6 | from cybulde.config_schemas.trainer import callbacks_schemas, logger_schemas 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class TrainerConfig(LoggableParamsMixin): 12 | _target_: str = "lightning.pytorch.trainer.trainer.Trainer" 13 | accelerator: str = "auto" 14 | strategy: str = "ddp_find_unused_parameters_true" 15 | devices: str = "auto" 16 | num_nodes: int = 1 # SI("${}") 17 | precision: str = "16-mixed" 18 | logger: Optional[list[logger_schemas.LoggerConfig]] = field(default_factory=lambda: []) # type: ignore 19 | callbacks: Optional[list[callbacks_schemas.CallbackConfig]] = field(default_factory=lambda: []) # type: ignore 20 | fast_dev_run: bool = False 21 | max_epochs: Optional[int] = None 22 | min_epochs: Optional[int] = None 23 | max_steps: int = -1 24 | min_steps: Optional[int] = None 25 | max_time: Optional[str] = None 26 | limit_train_batches: Optional[float] = 1 27 | limit_val_batches: Optional[float] = 1 28 | limit_test_batches: Optional[float] = 1 29 | limit_predict_batches: Optional[float] = 1 30 | overfit_batches: float = 0.0 31 | val_check_interval: Optional[float] = 1 32 | check_val_every_n_epoch: Optional[int] = 1 33 | num_sanity_val_steps: int = 2 34 | log_every_n_steps: int = 50 35 | enable_checkpointing: bool = True 36 | enable_progress_bar: bool = True 37 | enable_model_summary: bool = True 38 | accumulate_grad_batches: int = 1 39 | gradient_clip_val: Optional[float] = 5 40 | gradient_clip_algorithm: Optional[str] = "value" 41 | deterministic: Optional[bool] = None 42 | benchmark: Optional[bool] = None 43 | inference_mode: bool = True 44 | use_distributed_sampler: bool = True 45 | detect_anomaly: bool = False 46 | barebones: bool = False 47 | sync_batchnorm: bool = True 48 | reload_dataloaders_every_n_epochs: int = 0 49 | default_root_dir: Optional[str] = "./data/pytorch-lightning" 50 | 51 | def loggable_params(self) -> list[str]: 52 | return ["max_epochs", "max_steps", "strategy", "precision"] 53 | 54 | 55 | @dataclass 56 | class GPUDev(TrainerConfig): 57 | max_epochs: int = 3 58 | accelerator: str = "gpu" 59 | log_every_n_steps: int = 1 60 | limit_train_batches: float = 0.01 61 | limit_val_batches: float = 0.01 62 | limit_test_batches: float = 0.01 63 | logger: Optional[list[logger_schemas.LoggerConfig]] = field( 64 | default_factory=lambda: [logger_schemas.MLFlowLoggerConfig()] 65 | ) # type: ignore 66 | callbacks: Optional[list[callbacks_schemas.CallbackConfig]] = field( 67 | default_factory=lambda: [ 68 | callbacks_schemas.ValidationF1ScoreBestModelCheckpointConfig(), 69 | callbacks_schemas.LastModelCheckpointConfig(), 70 | callbacks_schemas.LearningRateMonitorConfig(), 71 | ] 72 | ) 73 | 74 | 75 | @dataclass 76 | class GPUProd(TrainerConfig): 77 | max_epochs: int = 20 78 | accelerator: str = "gpu" 79 | log_every_n_steps: int = 20 80 | logger: Optional[list[logger_schemas.LoggerConfig]] = field( 81 | default_factory=lambda: [logger_schemas.MLFlowLoggerConfig()] 82 | ) # type: ignore 83 | callbacks: Optional[list[callbacks_schemas.CallbackConfig]] = field( 84 | default_factory=lambda: [ 85 | 
callbacks_schemas.ValidationF1ScoreBestModelCheckpointConfig(), 86 | callbacks_schemas.LastModelCheckpointConfig(), 87 | callbacks_schemas.LearningRateMonitorConfig(), 88 | ] 89 | ) 90 | 91 | 92 | def setup_config() -> None: 93 | logger_schemas.setup_config() 94 | callbacks_schemas.setup_config() 95 | 96 | cs = ConfigStore.instance() 97 | cs.store(name="trainer_schema", group="tasks/trainer", node=TrainerConfig) 98 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "emkademy" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Kıvanç Yüksel "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10,<3.11" 10 | pandas = "~=2.0" 11 | matplotlib = "~=3.7" 12 | hydra-core = "~=1.3" 13 | pydantic = "~=1.10" 14 | fsspec = { version = "~=2023.6", extras = ["gcs"] } 15 | gcsfs = "~=2023.6" 16 | torch = "==2.1.1" 17 | lightning = "==2.1.2" 18 | torchmetrics = "~=1.2" 19 | transformers = { version = "~=4.35", extras = ["torch"] } 20 | mlflow = "==2.5.0" 21 | psycopg2-binary = "~=2.9" 22 | google-cloud-compute = "~=1.13" 23 | google-cloud-secret-manager = "~=2.16" 24 | python-etcd = "~=0.4" 25 | fastapi = "~=0.104" 26 | uvicorn = {version = "~=0.24", extras=["standard"]} 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | jupyterlab = "~=4.0" 30 | pytest = "~=7.4" 31 | black = "~=23.7.0" 32 | isort = "~=5.12" 33 | flake8 = "~=6.0" 34 | mypy = "~=1.4" 35 | 36 | [build-system] 37 | requires = ["poetry-core"] 38 | build-backend = "poetry.core.masonry.api" 39 | 40 | [tool.black] 41 | line-length = 120 42 | 43 | [tool.isort] 44 | profile = "black" 45 | multi_line_output = 3 46 | include_trailing_comma = true 47 | force_grid_wrap = 0 48 | use_parentheses = true 49 | ensure_newline_before_comments = true 50 | line_length = 120 51 | indent = 4 52 | lines_between_types = 1 53 | known_first_party = ["cybulde"] 54 | default_section = "THIRDPARTY" 55 | sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 56 | 57 | [tool.mypy] 58 | python_version = "3.10" 59 | show_error_codes = true 60 | ignore_missing_imports = true 61 | install_types = true 62 | non_interactive = true 63 | follow_imports = "silent" 64 | no_strict_optional = true 65 | 66 | warn_redundant_casts = false 67 | warn_unused_ignores = true 68 | warn_unused_configs = true 69 | warn_return_any = true 70 | warn_no_return = true 71 | warn_incomplete_stub = true 72 | 73 | disallow_subclassing_any = true 74 | 75 | disallow_untyped_calls = true 76 | disallow_untyped_defs = true 77 | disallow_incomplete_defs = true 78 | disallow_untyped_decorators = true 79 | check_untyped_defs = true 80 | strict_optional = true 81 | 82 | verbosity = 0 83 | 84 | [tool.ruff] 85 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 86 | select = ["E4", "E7", "E9", "F"] 87 | ignore = [] 88 | 89 | # Allow fix for all enabled rules (when `--fix`) is provided. 
90 | fixable = [ 91 | "A", 92 | "B", 93 | "C", 94 | "D", 95 | "E", 96 | "F", 97 | "G", 98 | "I", 99 | "N", 100 | "Q", 101 | "S", 102 | "T", 103 | "W", 104 | "ANN", 105 | "ARG", 106 | "BLE", 107 | "COM", 108 | "DJ", 109 | "DTZ", 110 | "EM", 111 | "ERA", 112 | "EXE", 113 | "FBT", 114 | "ICN", 115 | "INP", 116 | "ISC", 117 | "NPY", 118 | "PD", 119 | "PGH", 120 | "PIE", 121 | "PL", 122 | "PT", 123 | "PTH", 124 | "PYI", 125 | "RET", 126 | "RSE", 127 | "RUF", 128 | "SIM", 129 | "SLF", 130 | "TCH", 131 | "TID", 132 | "TRY", 133 | "UP", 134 | "YTT", 135 | ] 136 | unfixable = [] 137 | 138 | # Exclude a variety of commonly ignored directories. 139 | exclude = [ 140 | ".bzr", 141 | ".direnv", 142 | ".eggs", 143 | ".git", 144 | ".git-rewrite", 145 | ".hg", 146 | ".mypy_cache", 147 | ".nox", 148 | ".pants.d", 149 | ".pytype", 150 | ".ruff_cache", 151 | ".svn", 152 | ".tox", 153 | ".venv", 154 | "__pypackages__", 155 | "_build", 156 | "buck-out", 157 | "build", 158 | "dist", 159 | "node_modules", 160 | "venv", 161 | ] 162 | 163 | # Same as Black. 164 | line-length = 120 165 | 166 | # Allow unused variables when underscore-prefixed. 167 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 168 | 169 | # Assume Python 3.10 170 | target-version = "py310" 171 | 172 | [tool.ruff.mccabe] 173 | # Unlike Flake8, default to a complexity level of 10. 174 | max-complexity = 10 175 | -------------------------------------------------------------------------------- /cybulde/training/lightning_modules/bases.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from abc import abstractmethod 4 | from typing import Any, Callable, Iterable, Optional, Union 5 | 6 | import mlflow 7 | import torch 8 | 9 | from lightning.pytorch import LightningModule 10 | from torch import Tensor 11 | from torch.optim import Optimizer 12 | 13 | from cybulde.models.models import Model 14 | from cybulde.models.transformations import Transformation 15 | from cybulde.training.loss_functions import LossFunction 16 | from cybulde.training.schedulers import LightningScheduler 17 | from cybulde.utils.io_utils import open_file 18 | from cybulde.utils.utils import get_logger 19 | 20 | PartialOptimizerType = Callable[[Union[Iterable[Tensor], dict[str, Iterable[Tensor]]]], Optimizer] 21 | 22 | 23 | class TrainingLightningModule(LightningModule): 24 | def __init__( 25 | self, 26 | model: Model, 27 | loss: LossFunction, 28 | optimizer: PartialOptimizerType, 29 | scheduler: Optional[LightningScheduler] = None, 30 | ) -> None: 31 | super().__init__() 32 | 33 | self.model = model 34 | self.loss = loss 35 | self.partial_optimizer = optimizer 36 | self.scheduler = scheduler 37 | 38 | self.model_size = self._calculate_model_size() 39 | self.logging_logger = get_logger(self.__class__.__name__) 40 | 41 | def _calculate_model_size(self) -> float: 42 | param_size = 0 43 | for param in self.parameters(): 44 | param_size += param.nelement() * param.element_size() 45 | 46 | buffer_size = 0 47 | for buffer in self.buffers(): 48 | buffer_size += buffer.nelement() * buffer.element_size() 49 | 50 | size_all_mb = (param_size + buffer_size) / 1024**2 51 | return size_all_mb 52 | 53 | def configure_optimizers(self) -> Union[Optimizer, tuple[list[Optimizer], list[dict[str, Any]]]]: 54 | optimizer = self.partial_optimizer(self.parameters()) 55 | 56 | if self.scheduler is not None: 57 | scheduler = self.scheduler.configure_scheduler( 58 | optimizer=optimizer,
estimated_stepping_batches=self.trainer.estimated_stepping_batches 59 | ) 60 | return [optimizer], [scheduler] 61 | 62 | return optimizer 63 | 64 | def on_train_end(self) -> None: 65 | try: 66 | mlflow.log_metric("model_size", self.model_size) 67 | except Exception: 68 | pass 69 | return super().on_train_end() 70 | 71 | @abstractmethod 72 | def training_step(self, batch: Any, batch_idx: int) -> Tensor: 73 | ... 74 | 75 | @abstractmethod 76 | def validation_step(self, batch: Any, batch_idx: int) -> Tensor: 77 | ... 78 | 79 | @abstractmethod 80 | def get_transformation(self) -> Transformation: 81 | ... 82 | 83 | 84 | class ModelStateDictExportingTrainingLightningModule(TrainingLightningModule): 85 | @abstractmethod 86 | def export_model_state_dict(self, checkpoint_path: str) -> str: 87 | """ 88 | Export model state dict from LightningModule checkpoint and save it 89 | to the same location as the checkpoint_path, and return the save path 90 | """ 91 | 92 | def common_export_model_state_dict(self, checkpoint_path: str) -> str: 93 | with open_file(checkpoint_path, "rb") as f: 94 | state_dict = torch.load(f, map_location=torch.device("cpu"))["state_dict"] 95 | 96 | model_state_dict = {} 97 | for key, value in state_dict.items(): 98 | if not key.startswith("loss."): 99 | model_state_dict[key.replace("model.", "", 1)] = value 100 | 101 | model_state_dict_save_path = os.path.join(os.path.dirname(checkpoint_path), "model_state_dict.pth") 102 | 103 | with open_file(model_state_dict_save_path, "wb") as f: 104 | torch.save(model_state_dict, f) 105 | 106 | return model_state_dict_save_path 107 | -------------------------------------------------------------------------------- /cybulde/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import sys 4 | 5 | from io import StringIO 6 | from pathlib import Path 7 | from typing import TYPE_CHECKING, Any, Optional, Union 8 | 9 | import hydra 10 | import yaml 11 | 12 | from hydra import compose, initialize 13 | from hydra.types import TaskFunction 14 | from omegaconf import DictConfig, OmegaConf 15 | 16 | from cybulde.config_schemas import config_schema 17 | from cybulde.utils.io_utils import open_file, write_yaml_file 18 | 19 | if TYPE_CHECKING: 20 | from cybulde.config_schemas.config_schema import Config 21 | 22 | 23 | def get_config( 24 | config_path: str, config_name: str, to_object: bool = True, return_dict_config: bool = False 25 | ) -> TaskFunction: 26 | setup_config() 27 | setup_logger() 28 | 29 | def main_decorator(task_function: TaskFunction) -> Any: 30 | @hydra.main(config_path=config_path, config_name=config_name, version_base=None) 31 | def decorated_main(dict_config: Optional[DictConfig] = None) -> Any: 32 | if to_object: 33 | config = OmegaConf.to_object(dict_config) 34 | 35 | if not return_dict_config: 36 | assert to_object 37 | return task_function(config) 38 | return task_function(dict_config) 39 | 40 | return decorated_main 41 | 42 | return main_decorator 43 | 44 | 45 | def get_config_and_dict_config(config_path: str, config_name: str) -> Any: 46 | setup_config() 47 | setup_logger() 48 | 49 | def main_decorator(task_function: Any) -> Any: 50 | @hydra.main(config_path=config_path, config_name=config_name, version_base=None) 51 | def decorated_main(dict_config: Optional[DictConfig] = None) -> Any: 52 | config = OmegaConf.to_object(dict_config) 53 | return task_function(config, dict_config) 54 | 55 | return decorated_main 56 | 57 | return 
main_decorator 58 | 59 | 60 | def setup_config() -> None: 61 | config_schema.setup_config() 62 | 63 | 64 | def setup_logger() -> None: 65 | with open("./cybulde/configs/hydra/job_logging/custom.yaml", "r") as stream: 66 | config = yaml.load(stream, Loader=yaml.FullLoader) 67 | logging.config.dictConfig(config) 68 | 69 | 70 | def save_config_as_yaml(config: Union["Config", DictConfig], save_path: str) -> None: 71 | text_io = StringIO() 72 | text_io.writelines( 73 | [ 74 | f"# Do not edit this file. It is automatically generated by {sys.argv[0]}.\n", 75 | "# If you want to modify configuration, edit source files in cybulde/configs directory.\n", 76 | "\n", 77 | ] 78 | ) 79 | 80 | config_header = load_config_header() 81 | text_io.write(config_header) 82 | text_io.write("\n") 83 | 84 | OmegaConf.save(config, text_io, resolve=True) 85 | with open_file(save_path, "w") as f: 86 | f.write(text_io.getvalue()) 87 | 88 | 89 | def load_config_header() -> str: 90 | config_header_path = Path("./cybulde/configs/automatically_generated/full_config_header.yaml") 91 | if not config_header_path.exists(): 92 | config_header = { 93 | "defaults": [ 94 | # {"override hydra/job_logging": "custom"}, 95 | {"override hydra/hydra_logging": "disabled"}, 96 | "_self_", 97 | ], 98 | "hydra": {"output_subdir": None, "run": {"dir": "."}}, 99 | } 100 | config_header_path.parent.mkdir(parents=True, exist_ok=True) 101 | write_yaml_file(str(config_header_path), config_header) 102 | 103 | with open(config_header_path, "r") as f: 104 | return f.read() 105 | 106 | 107 | def load_config(config_path: str, config_name: str, overrides: Optional[list[str]] = None) -> Any: 108 | setup_config() 109 | setup_logger() 110 | 111 | if overrides is None: 112 | overrides = [] 113 | 114 | with initialize(version_base=None, config_path=config_path, job_name="config-compose"): 115 | config = compose(config_name=config_name, overrides=overrides) 116 | 117 | return config 118 | -------------------------------------------------------------------------------- /cybulde/evaluation/model_selector.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | from mlflow.entities import Run 4 | 5 | from cybulde.utils.mlflow_utils import get_best_run, get_client 6 | from cybulde.utils.utils import get_logger 7 | 8 | 9 | class MetricComparer: 10 | def __init__(self, bigger_is_better: bool, can_be_equal: bool, metric_name: str, threshold: float = 0.0) -> None: 11 | self.bigger_is_better = bigger_is_better 12 | self.can_be_equal = can_be_equal 13 | self.metric_name = metric_name 14 | self.threshold = threshold 15 | 16 | def is_metric_better(self, run: Run, best_run_data: dict[str, Any]) -> bool: 17 | if not best_run_data: 18 | return True 19 | 20 | current_metric_value = self.get_current_metric_value(run) 21 | best_metric_value = best_run_data[f"metrics.{self.metric_name}"] 22 | 23 | if self.can_be_equal and current_metric_value == best_metric_value: 24 | return True 25 | 26 | if self.bigger_is_better: 27 | current_metric_value -= self.threshold 28 | result = current_metric_value > best_metric_value 29 | assert isinstance(result, bool) 30 | return result 31 | else: 32 | current_metric_value += self.threshold 33 | result = current_metric_value < best_metric_value 34 | assert isinstance(result, bool) 35 | return result 36 | 37 | def get_current_metric_value(self, run: Run) -> float: 38 | current_metric_value = run.data.metrics.get(self.metric_name, None) 39 | if current_metric_value is None: 
40 | raise RuntimeError(f"Metric: {self.metric_name} couldn't be found on MLFlow. Was it logged?") 41 | assert isinstance(current_metric_value, float) 42 | return current_metric_value 43 | 44 | 45 | class ModelSelector: 46 | def __init__( 47 | self, 48 | mlflow_run_id: str, 49 | must_be_better_metric_comparers: dict[str, MetricComparer] = {}, 50 | to_be_thresholded_metric_comparers: dict[str, MetricComparer] = {}, 51 | threshold: float = 0.0, 52 | ) -> None: 53 | if not must_be_better_metric_comparers and not to_be_thresholded_metric_comparers: 54 | raise ValueError( 55 | "Both 'must_be_better_metric_comparers' and 'to_be_thresholded_metric_comparers' cannot be empty..." 56 | ) 57 | 58 | self.logger = get_logger(self.__class__.__name__) 59 | 60 | self.mlflow_run_id = mlflow_run_id 61 | self.must_be_better_metric_comparers = must_be_better_metric_comparers 62 | self.to_be_thresholded_metric_comparers = to_be_thresholded_metric_comparers 63 | self.threshold = threshold 64 | 65 | client = get_client() 66 | self.run = client.get_run(mlflow_run_id) 67 | self.best_run_data = get_best_run() 68 | self.new_best_run_tag: Optional[str] = None 69 | 70 | def is_selected(self) -> bool: 71 | is_selected = self._is_selected(self.run) 72 | if is_selected: 73 | self.new_best_run_tag = self.get_new_best_run_tag() 74 | return is_selected 75 | 76 | def _is_selected(self, run: Run) -> bool: 77 | for metric_name, metric_comparer in self.must_be_better_metric_comparers.items(): 78 | if not metric_comparer.is_metric_better(run, self.best_run_data): 79 | self.logger.info(f"'{metric_name}' is a must have metric, and its value is not better than before...") 80 | return False 81 | 82 | hits = [] 83 | for metric_comparer in self.to_be_thresholded_metric_comparers.values(): 84 | is_metric_better = metric_comparer.is_metric_better(run, self.best_run_data) 85 | hits.append(int(is_metric_better)) 86 | 87 | if not hits: 88 | return True 89 | 90 | mean_hits = sum(hits) / len(hits) 91 | return mean_hits > self.threshold 92 | 93 | def get_new_best_run_tag(self) -> str: 94 | if len(self.best_run_data) == 0: 95 | return "v1" 96 | last_tag: str = self.best_run_data["tags.best_run"] 97 | last_version = int(last_tag[1:]) 98 | return f"v{last_version + 1}" 99 | -------------------------------------------------------------------------------- /cybulde/utils/gcp_utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import typing as t 3 | 4 | from dataclasses import dataclass 5 | 6 | from google.api_core.exceptions import GoogleAPICallError 7 | from google.api_core.extended_operation import ExtendedOperation 8 | from google.cloud import compute_v1, secretmanager 9 | 10 | from cybulde.utils.utils import get_logger 11 | 12 | GCP_UTILS_LOGGER = get_logger(__name__) 13 | 14 | 15 | def access_secret_version(project_id: str, secret_id: str, version_id: str = "1") -> str: 16 | """ 17 | Access the payload for the given secret version if one exists. The version 18 | can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 
19 | """ 20 | client = secretmanager.SecretManagerServiceClient() 21 | name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" 22 | response = client.access_secret_version(request={"name": name}) 23 | payload: str = response.payload.data.decode("UTF-8") 24 | 25 | return payload 26 | 27 | 28 | def wait_for_extended_operation( 29 | operation: ExtendedOperation, verbose_name: str = "operation", timeout: int = 300 30 | ) -> t.Any: 31 | try: 32 | result = operation.result(timeout=timeout) # type: ignore 33 | except GoogleAPICallError as ex: 34 | GCP_UTILS_LOGGER.exception("Exception occurred") 35 | for attr in ["details", "domain", "errors", "metadata", "reason", "response"]: 36 | value = getattr(ex, attr, None) 37 | if value: 38 | GCP_UTILS_LOGGER.error(f"ex.{attr}:\n{value}") 39 | if isinstance(ex.response, compute_v1.Operation): 40 | for error in ex.response.error.errors: 41 | GCP_UTILS_LOGGER.error(f"Error message: {error.message}") 42 | 43 | raise RuntimeError("Exception during extended operation") from ex 44 | 45 | if operation.error_code: 46 | GCP_UTILS_LOGGER.error( 47 | f"Error during {verbose_name}: [Code: {operation.error_code}]: {operation.error_message}" 48 | ) 49 | GCP_UTILS_LOGGER.error(f"Operation ID: {operation.name}") 50 | raise operation.exception() or RuntimeError(operation.error_message) # type: ignore 51 | 52 | if operation.warnings: 53 | GCP_UTILS_LOGGER.warning(f"Warnings during {verbose_name}:\n") 54 | for warning in operation.warnings: 55 | GCP_UTILS_LOGGER.warning(f" - {warning.code}: {warning.message}") 56 | 57 | return result 58 | 59 | 60 | @dataclass 61 | class TrainingInfo: 62 | project_id: str 63 | zone: str 64 | instance_group_name: str 65 | instance_ids: list[int] 66 | mlflow_experiment_url: str 67 | 68 | def get_job_info_message(self) -> str: 69 | instance_ids_regex, log_viewer_url, train_cluster_url = self._get_job_tracking_links() 70 | 71 | run_description = f""" 72 | Deployed training cluster: {train_cluster_url} 73 | Experiment logs (python): {log_viewer_url} 74 | if something goes wrong type in log viewer query field: 75 | ``` 76 | resource.type="gce_instance" 77 | logName="projects/{self.project_id}/logs/GCEMetadataScripts" 78 | resource.labels.instance_id={instance_ids_regex} 79 | ``` 80 | """ 81 | return inspect.cleandoc(run_description) 82 | 83 | def _get_job_tracking_links(self) -> tuple[str, str, str]: 84 | instance_ids = [str(id) for id in self.instance_ids] 85 | instance_ids_regex = " OR ".join(instance_ids) 86 | instance_ids_url = "%20OR%20".join(instance_ids) 87 | cluster_url = f"https://console.cloud.google.com/compute/instanceGroups/details/{self.zone}/{self.instance_group_name}?project={self.project_id}" 88 | log_viewer_url = f"https://console.cloud.google.com/logs/query;query=resource.type%3D%22gce_instance%22%0Aresource.labels.instance_id%3D%2528{instance_ids_url}%2529?project={self.project_id}" 89 | return instance_ids_regex, log_viewer_url, cluster_url 90 | 91 | def print_job_info(self) -> None: 92 | print(f"============ Task {self.instance_group_name} details ============") 93 | print(f"MLFlow experiment url: {self.mlflow_experiment_url}") 94 | print(self.get_job_info_message()) 95 | -------------------------------------------------------------------------------- /cybulde/utils/mlflow_utils.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | 4 | from contextlib import contextmanager 5 | from typing import TYPE_CHECKING, Any, Generator, Iterable, 
Optional 6 | 7 | import mlflow 8 | 9 | from mlflow.pyfunc import PythonModel 10 | from mlflow.tracking.fluent import ActiveRun 11 | 12 | from cybulde.config_schemas.infrastructure.infrastructure_schema import MLFlowConfig 13 | from cybulde.utils.mixins import LoggableParamsMixin 14 | 15 | MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI") 16 | 17 | if TYPE_CHECKING: 18 | from cybulde.config_schemas.config_schema import Config 19 | 20 | 21 | @contextmanager # type: ignore 22 | def activate_mlflow( 23 | experiment_name: Optional[str] = None, 24 | run_id: Optional[str] = None, 25 | run_name: Optional[str] = None, 26 | ) -> Iterable[mlflow.ActiveRun]: 27 | set_experiment(experiment_name) 28 | 29 | run: ActiveRun 30 | with mlflow.start_run(run_name=run_name, run_id=run_id) as run: 31 | yield run 32 | 33 | 34 | def set_experiment(experiment_name: Optional[str] = None) -> None: 35 | if experiment_name is None: 36 | experiment_name = "Default" 37 | 38 | try: 39 | mlflow.create_experiment(experiment_name) 40 | except mlflow.exceptions.RestException: 41 | pass 42 | 43 | mlflow.set_experiment(experiment_name) 44 | 45 | 46 | def log_artifacts_for_reproducibility() -> None: 47 | locations_to_store = ["./cybulde", "./docker", "./pyproject.toml", "./poetry.lock"] 48 | 49 | for location_to_store in locations_to_store: 50 | mlflow.log_artifact(location_to_store, "reproduction") 51 | 52 | 53 | def log_training_hparams(config: "Config") -> None: 54 | logged_nodes = set() 55 | 56 | def loggable_params(node: Any, path: list[str]) -> Generator[tuple[str, Any], None, None]: 57 | if isinstance(node, LoggableParamsMixin) and id(node) not in logged_nodes: 58 | for param_name in node.loggable_params(): 59 | yield ".".join(path + [param_name]), getattr(node, param_name) 60 | logged_nodes.add(id(node)) 61 | children = None 62 | if isinstance(node, dict): 63 | children = node.items() 64 | if dataclasses.is_dataclass(node): 65 | children = ((f.name, getattr(node, f.name)) for f in dataclasses.fields(node)) # type: ignore 66 | 67 | if children is None: 68 | return 69 | for key, val in children: 70 | for item in loggable_params(val, path + [key]): 71 | yield item 72 | 73 | params = dict(loggable_params(config, [])) 74 | mlflow.log_params(params) 75 | 76 | 77 | def get_client() -> mlflow.tracking.MlflowClient: 78 | return mlflow.tracking.MlflowClient(MLFLOW_TRACKING_URI) 79 | 80 | 81 | def get_all_experiment_ids() -> list[str]: 82 | return [exp.experiment_id for exp in mlflow.search_experiments()] 83 | 84 | 85 | def get_best_run() -> dict[str, Any]: 86 | best_runs = mlflow.search_runs(get_all_experiment_ids(), filter_string="tag.best_run LIKE 'v%'") 87 | if len(best_runs) == 0: 88 | return {} 89 | 90 | indices = best_runs["tags.best_run"].str.split("v").str[-1].astype(int).sort_values() 91 | best_runs = best_runs.reindex(index=indices.index) 92 | best_runs_dict: dict[str, Any] = best_runs.iloc[-1].to_dict() 93 | return best_runs_dict 94 | 95 | 96 | class DummyWrapper(PythonModel): # type: ignore 97 | def load_context(self, some_path: str) -> None: 98 | pass 99 | 100 | def predict(self, some_input: Any, some_other_parameter: Any) -> Optional[float]: 101 | pass 102 | 103 | 104 | def log_model(mlflow_config: MLFlowConfig, new_best_run_tag: str, registered_model_name: str) -> None: 105 | experiment_name = mlflow_config.experiment_name 106 | run_id = mlflow_config.run_id 107 | run_name = mlflow_config.run_name 108 | 109 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 110 | 
mlflow.pyfunc.log_model( 111 | artifact_path="", python_model=DummyWrapper(), registered_model_name=registered_model_name 112 | ) 113 | mlflow.set_tag("best_run", new_best_run_tag) 114 | -------------------------------------------------------------------------------- /cybulde/models/common/exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tarfile 4 | import tempfile 5 | 6 | from typing import Any 7 | 8 | import torch 9 | 10 | from hydra.utils import instantiate 11 | from omegaconf import DictConfig, OmegaConf 12 | 13 | from cybulde.models.common.io_utils import cache_gcs_resource_locally, copy_file 14 | from cybulde.models.common.utils import get_global_rank, get_local_rank, global_rank_zero_first, local_rank_zero_first 15 | from cybulde.utils.utils import get_logger 16 | 17 | MODELS_MODULE_PATH = "cybulde/models" 18 | TEMP_MODELS_MODULE_PATH = "temp_module/models" 19 | MODEL_CONFIG_PATH = "model_config.yaml" 20 | STATE_DICT_PATH = "model_state_dict.pth" 21 | EXPORTED_MODEL_FILE_NAME = "exported_model.tar.gz" 22 | 23 | 24 | class TarModelExporter: 25 | def __init__( 26 | self, 27 | model_state_dict_path: str, 28 | model_config: Any, 29 | tar_model_export_path: str, 30 | ) -> None: 31 | self.model_state_dict_path = model_state_dict_path 32 | self.model_config = model_config 33 | self.tar_model_export_path = tar_model_export_path 34 | 35 | self.logger = get_logger(self.__class__.__name__) 36 | 37 | def export(self) -> None: 38 | with global_rank_zero_first(): 39 | if get_global_rank() in [0, -1]: 40 | state_dict_path = self.download_model_state_dict() 41 | model_config_path = self.save_model_config() 42 | 43 | local_tar_path = os.path.join(tempfile.gettempdir(), EXPORTED_MODEL_FILE_NAME) 44 | with tarfile.open(local_tar_path, "w:gz") as tar: 45 | tar.add(MODELS_MODULE_PATH, arcname=TEMP_MODELS_MODULE_PATH) 46 | tar.add(state_dict_path, arcname=STATE_DICT_PATH) 47 | tar.add(model_config_path, arcname=MODEL_CONFIG_PATH) 48 | 49 | copy_file(local_tar_path, self.tar_model_export_path) 50 | 51 | self.logger.info("Model exported successfully!") 52 | 53 | def download_model_state_dict(self) -> str: 54 | return cache_gcs_resource_locally(self.model_state_dict_path) 55 | 56 | def save_model_config(self) -> str: 57 | model_config_save_path = os.path.join(tempfile.gettempdir(), MODEL_CONFIG_PATH) 58 | OmegaConf.save(self.model_config, model_config_save_path) 59 | return model_config_save_path 60 | 61 | 62 | class TarModelLoader: 63 | def __init__(self, exported_model_path: str) -> None: 64 | self.exported_model_path = exported_model_path 65 | self.replace_module_from = MODELS_MODULE_PATH.split("/")[0] 66 | self.replace_module_to = TEMP_MODELS_MODULE_PATH.split("/")[0] 67 | self.logger = get_logger(self.__class__.__name__) 68 | 69 | def load(self) -> Any: 70 | temp_export_path = "/tmp/temp_cybulde" 71 | 72 | with local_rank_zero_first(): 73 | if get_local_rank() in [0, -1]: 74 | self.extract_tar_gz(temp_export_path) 75 | 76 | model_config = self.load_model_config(temp_export_path) 77 | model = self.load_model(temp_export_path, model_config) 78 | return model 79 | 80 | def extract_tar_gz(self, export_path: str) -> None: 81 | local_exported_model_path = cache_gcs_resource_locally(self.exported_model_path) 82 | with tarfile.open(local_exported_model_path, "r:gz") as tar: 83 | tar.extractall(path=export_path) 84 | 85 | def load_model_config(self, model_dir: str) -> Any: 86 | model_config = 
OmegaConf.load(f"{model_dir}/{MODEL_CONFIG_PATH}") 87 | model_config = self._replace_module_in_model_config(model_config) 88 | return model_config 89 | 90 | def load_model(self, model_dir: str, model_config: Any) -> Any: 91 | sys.path.append(model_dir) 92 | 93 | model = instantiate(model_config) 94 | state_dict = torch.load(f"{model_dir}/{STATE_DICT_PATH}") 95 | model.load_state_dict(state_dict) 96 | 97 | sys.path.remove(model_dir) 98 | return model 99 | 100 | def _replace_module_in_model_config(self, config: Any) -> Any: 101 | for key, value in config.items(): 102 | if isinstance(value, (dict, DictConfig)): 103 | self._replace_module_in_model_config(value) 104 | 105 | if key == "_target_": 106 | assert isinstance(value, str) 107 | config[key] = value.replace(self.replace_module_from, self.replace_module_to) 108 | 109 | return config 110 | -------------------------------------------------------------------------------- /cybulde/data_modules/data_modules.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Optional, Protocol 2 | 3 | from lightning.pytorch import LightningDataModule 4 | from torch import Tensor 5 | from torch.utils.data import BatchSampler, DataLoader, Dataset, Sampler, default_collate 6 | from transformers import BatchEncoding 7 | 8 | from cybulde.data_modules.datasets import TextClassificationDataset 9 | from cybulde.models.transformations import HuggingFaceTokenizationTransformation, Transformation 10 | 11 | 12 | class DataModule(LightningDataModule): 13 | def __init__( 14 | self, 15 | batch_size: int, 16 | shuffle: bool = False, 17 | sampler: Optional[Sampler] = None, 18 | batch_sampler: Optional[BatchSampler] = None, 19 | num_workers: int = 0, 20 | collate_fn: Optional[Callable[[Any], Any]] = None, 21 | pin_memory: bool = False, 22 | drop_last: bool = False, 23 | persistent_workers: bool = False, 24 | ) -> None: 25 | super().__init__() 26 | 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.sampler = sampler 30 | self.batch_sampler = batch_sampler 31 | self.num_workers = num_workers 32 | self.collate_fn = collate_fn 33 | self.pin_memory = pin_memory 34 | self.drop_last = drop_last 35 | self.persistent_workers = persistent_workers 36 | 37 | def initialize_dataloader(self, dataset: Dataset, is_test: bool) -> DataLoader: 38 | return DataLoader( 39 | dataset, 40 | batch_size=self.batch_size, 41 | shuffle=self.shuffle and not is_test, 42 | sampler=self.sampler, 43 | batch_sampler=self.batch_sampler, 44 | num_workers=self.num_workers, 45 | collate_fn=self.collate_fn, 46 | pin_memory=self.pin_memory, 47 | drop_last=self.drop_last, 48 | persistent_workers=self.persistent_workers, 49 | ) 50 | 51 | 52 | class PartialDataModuleType(Protocol): 53 | def __call__(self, transformation: Transformation) -> DataModule: 54 | ... 
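# A hedged usage sketch of this protocol: a "partial" data module is the
# constructor with everything bound except `transformation` (Hydra's
# `_partial_: true` yields the same kind of callable from config), so a task
# can inject the transformation that the model itself exposes. The paths and
# batch size below are hypothetical placeholders, and `model` is assumed to be
# a cybulde.models.models.Model built elsewhere:
#
#     import functools
#
#     partial_data_module: PartialDataModuleType = functools.partial(
#         TextClassificationDataModule,  # defined just below
#         train_df_path="train.parquet",
#         dev_df_path="dev.parquet",
#         test_df_path="test.parquet",
#         text_column_name="cleaned_text",
#         label_column_name="label",
#         batch_size=32,
#     )
#     data_module = partial_data_module(transformation=model.get_transformation())
#     data_module.setup(stage="fit")
#     train_dataloader = data_module.train_dataloader()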
55 | 56 | 57 | class TextClassificationDataModule(DataModule): 58 | def __init__( 59 | self, 60 | train_df_path: str, 61 | dev_df_path: str, 62 | test_df_path: str, 63 | transformation: HuggingFaceTokenizationTransformation, 64 | text_column_name: str, 65 | label_column_name: str, 66 | batch_size: int, 67 | shuffle: bool = False, 68 | sampler: Optional[Sampler] = None, 69 | batch_sampler: Optional[BatchSampler] = None, 70 | num_workers: int = 0, 71 | pin_memory: bool = False, 72 | drop_last: bool = False, 73 | persistent_workers: bool = False, 74 | ) -> None: 75 | def tokenization_collate_fn(batch: list[tuple[str, int]]) -> tuple[BatchEncoding, Tensor]: 76 | texts, labels = default_collate(batch) 77 | encodings = transformation(texts) 78 | return encodings, labels 79 | 80 | super().__init__( 81 | batch_size=batch_size, 82 | shuffle=shuffle, 83 | sampler=sampler, 84 | batch_sampler=batch_sampler, 85 | num_workers=num_workers, 86 | collate_fn=tokenization_collate_fn, 87 | pin_memory=pin_memory, 88 | drop_last=drop_last, 89 | persistent_workers=persistent_workers, 90 | ) 91 | 92 | self.train_df_path = train_df_path 93 | self.dev_df_path = dev_df_path 94 | self.test_df_path = test_df_path 95 | 96 | self.text_column_name = text_column_name 97 | self.label_column_name = label_column_name 98 | 99 | def setup(self, stage: Optional[str] = None) -> None: 100 | if stage == "fit" or stage is None: 101 | self.train_dataset = TextClassificationDataset( 102 | self.train_df_path, self.text_column_name, self.label_column_name 103 | ) 104 | self.dev_dataset = TextClassificationDataset( 105 | self.dev_df_path, self.text_column_name, self.label_column_name 106 | ) 107 | 108 | if stage == "test": 109 | self.test_dataset = TextClassificationDataset( 110 | self.test_df_path, self.text_column_name, self.label_column_name 111 | ) 112 | 113 | def train_dataloader(self) -> DataLoader: 114 | return self.initialize_dataloader(self.train_dataset, is_test=False) 115 | 116 | def val_dataloader(self) -> DataLoader: 117 | return self.initialize_dataloader(self.dev_dataset, is_test=True) 118 | 119 | def test_dataloader(self) -> DataLoader: 120 | return self.initialize_dataloader(self.test_dataset, is_test=True) 121 | -------------------------------------------------------------------------------- /cybulde/training/lightning_modules/binary_text_classification.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Optional 3 | 4 | import mlflow 5 | import torch 6 | 7 | from torch import Tensor 8 | from torchmetrics.classification import BinaryAccuracy, BinaryConfusionMatrix, BinaryF1Score 9 | from transformers import BatchEncoding 10 | 11 | from cybulde.models.models import Model 12 | from cybulde.models.transformations import Transformation 13 | from cybulde.training.lightning_modules.bases import ( 14 | ModelStateDictExportingTrainingLightningModule, 15 | PartialOptimizerType, 16 | ) 17 | from cybulde.training.loss_functions import LossFunction 18 | from cybulde.training.schedulers import LightningScheduler 19 | from cybulde.utils.torch_utils import plot_confusion_matrix 20 | 21 | 22 | class BinaryTextClassificationTrainingLightningModule(ModelStateDictExportingTrainingLightningModule): 23 | def __init__( 24 | self, 25 | model: Model, 26 | loss: LossFunction, 27 | optimizer: PartialOptimizerType, 28 | scheduler: Optional[LightningScheduler] = None, 29 | ) -> None: 30 | super().__init__(model=model, loss=loss, optimizer=optimizer, 
scheduler=scheduler) 31 | 32 | self.training_accuracy = BinaryAccuracy() 33 | self.validation_accuracy = BinaryAccuracy() 34 | 35 | self.training_f1_score = BinaryF1Score() 36 | self.validation_f1_score = BinaryF1Score() 37 | 38 | self.training_confusion_matrix = BinaryConfusionMatrix() 39 | self.validation_confusion_matrix = BinaryConfusionMatrix() 40 | 41 | self.train_step_outputs: dict[str, list[Tensor]] = defaultdict(list) 42 | self.validation_step_outputs: dict[str, list[Tensor]] = defaultdict(list) 43 | 44 | self.pos_weight: Optional[Tensor] = None 45 | 46 | def set_pos_weight(self, pos_weight: Tensor) -> None: 47 | self.pos_weight = pos_weight 48 | 49 | def forward(self, texts: BatchEncoding) -> Tensor: 50 | output: Tensor = self.model(texts) 51 | return output 52 | 53 | def training_step(self, batch: tuple[BatchEncoding, Tensor], batch_idx: int) -> Tensor: 54 | texts, labels = batch 55 | logits = self(texts) 56 | 57 | self.pos_weight = self.pos_weight.to(self.device) 58 | loss = self.loss(logits, labels, pos_weight=self.pos_weight) 59 | self.log("loss", loss, sync_dist=True) 60 | 61 | self.training_accuracy(logits, labels) 62 | self.training_f1_score(logits, labels) 63 | self.training_confusion_matrix(logits, labels) 64 | 65 | self.log("training_accuracy", self.training_accuracy, on_step=False, on_epoch=True) 66 | self.log("training_f1_score", self.training_f1_score, on_step=False, on_epoch=True) 67 | 68 | self.train_step_outputs["logits"].append(logits) 69 | self.train_step_outputs["labels"].append(labels) 70 | 71 | assert isinstance(loss, Tensor) 72 | return loss 73 | 74 | def on_train_epoch_end(self) -> None: 75 | all_logits = torch.stack(self.train_step_outputs["logits"]) 76 | all_labels = torch.stack(self.train_step_outputs["labels"]) 77 | 78 | confusion_matrix = self.training_confusion_matrix(all_logits, all_labels) 79 | figure = plot_confusion_matrix(confusion_matrix, ["0", "1"]) 80 | mlflow.log_figure(figure, "training_confusion_matrix.png") 81 | 82 | self.train_step_outputs = defaultdict(list) 83 | 84 | def validation_step(self, batch: tuple[BatchEncoding, Tensor], batch_idx: int) -> dict[str, Tensor]: # type: ignore 85 | texts, labels = batch 86 | logits = self(texts) 87 | 88 | loss = self.loss(logits, labels) 89 | self.log("validation_loss", loss, sync_dist=True) 90 | 91 | self.validation_accuracy(logits, labels) 92 | self.validation_f1_score(logits, labels) 93 | 94 | self.log("validation_accuracy", self.validation_accuracy, on_step=False, on_epoch=True) 95 | self.log("validation_f1_score", self.validation_f1_score, on_step=False, on_epoch=True) 96 | 97 | self.validation_step_outputs["logits"].append(logits) 98 | self.validation_step_outputs["labels"].append(labels) 99 | 100 | return {"loss": loss, "predictions": logits, "labels": labels} 101 | 102 | def on_validation_epoch_end(self) -> None: 103 | all_logits = torch.stack(self.validation_step_outputs["logits"]) 104 | all_labels = torch.stack(self.validation_step_outputs["labels"]) 105 | 106 | confusion_matrix = self.validation_confusion_matrix(all_logits, all_labels) 107 | figure = plot_confusion_matrix(confusion_matrix, ["0", "1"]) 108 | mlflow.log_figure(figure, "validation_confusion_matrix.png") 109 | 110 | self.validation_step_outputs = defaultdict(list) 111 | 112 | def get_transformation(self) -> Transformation: 113 | return self.model.get_transformation() 114 | 115 | def export_model_state_dict(self, checkpoint_path: str) -> str: 116 | return self.common_export_model_state_dict(checkpoint_path) 117 | 
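# A hedged sketch of wiring this module up by hand, mirroring what the Hydra
# config instantiates. Two contracts are easy to miss: `optimizer` must be a
# PartialOptimizerType (a callable that still expects the parameters), and
# `set_pos_weight()` must be called before training starts, because
# training_step() unconditionally moves `self.pos_weight` to the device.
# `model` is assumed to be built elsewhere, and the weight value is a
# hypothetical class-imbalance ratio:
#
#     import functools
#
#     import torch
#
#     from cybulde.training.loss_functions import BCEWithLogitsLoss
#
#     module = BinaryTextClassificationTrainingLightningModule(
#         model=model,
#         loss=BCEWithLogitsLoss(reduction="mean"),
#         optimizer=functools.partial(torch.optim.AdamW, lr=5e-5, weight_decay=1e-3),
#     )
#     module.set_pos_weight(torch.tensor([2.0]))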
-------------------------------------------------------------------------------- /cybulde/models/adapters.py: -------------------------------------------------------------------------------- 1 | from operator import attrgetter 2 | from typing import Literal, Optional 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from torch import Tensor, nn 8 | from transformers.modeling_outputs import BaseModelOutputWithPooling 9 | 10 | 11 | class Adapter(nn.Module): 12 | pass 13 | 14 | 15 | class Normalization(nn.Module): 16 | def __init__(self, p: float = 2.0) -> None: 17 | super().__init__() 18 | self.p = p 19 | 20 | def forward(self, x: Tensor) -> Tensor: 21 | return F.normalize(x, p=self.p, dim=1) 22 | 23 | 24 | class FCLayer(Adapter): 25 | def __init__( 26 | self, 27 | in_features: int, 28 | out_features: int, 29 | bias: bool, 30 | activation_fn: Optional[nn.Module] = None, 31 | dropout: float = 0.0, 32 | batch_norm: bool = False, 33 | order: str = "LABDN", 34 | ) -> None: 35 | super().__init__() 36 | 37 | order = order.upper() 38 | 39 | layers: dict[str, tuple[str, nn.Module]] = {"L": ("linear", nn.Linear(in_features, out_features, bias=bias))} 40 | 41 | if activation_fn is not None: 42 | layers["A"] = ("activation_fn", activation_fn) 43 | 44 | if batch_norm: 45 | layers["B"] = ( 46 | "batch_norm", 47 | nn.BatchNorm1d(out_features if order.index("L") < order.index("B") else in_features), 48 | ) 49 | 50 | if dropout > 0.0: 51 | layers["D"] = ("dropout", nn.Dropout(dropout)) 52 | 53 | if "N" in order: 54 | layers["N"] = ("normalization", Normalization()) 55 | 56 | self.layers = nn.Sequential() 57 | for layer_code in order: 58 | if layer_code in layers: 59 | name, layer = layers[layer_code] 60 | self.layers.add_module(name, layer) 61 | 62 | def forward(self, x: Tensor) -> Tensor: 63 | output: Tensor = self.layers(x) 64 | return output 65 | 66 | 67 | class MLPLayer(Adapter): 68 | def __init__( 69 | self, 70 | output_feature_sizes: list[int], 71 | biases: Optional[list[bool]] = None, 72 | activation_fns: Optional[list[Optional[str]]] = None, 73 | dropout_drop_probs: Optional[list[float]] = None, 74 | batch_norms: Optional[list[bool]] = None, 75 | order: str = "LABDN", 76 | standardize_input: bool = True, 77 | ) -> None: 78 | super().__init__() 79 | 80 | self.output_feature_sizes = output_feature_sizes 81 | self.output_embedding_size = output_feature_sizes[-1] 82 | 83 | nrof_layers = len(self.output_feature_sizes) - 1 84 | biases = [False] * nrof_layers if biases is None else biases 85 | activation_functions: list[Optional[str]] = [None] * nrof_layers if activation_fns is None else activation_fns 86 | dropout_drop_probabilities = [0.0] * nrof_layers if dropout_drop_probs is None else dropout_drop_probs 87 | batch_normalizations = [False] * nrof_layers if batch_norms is None else batch_norms 88 | 89 | assert ( 90 | nrof_layers 91 | == len(activation_functions) 92 | == len(dropout_drop_probabilities) 93 | == len(batch_normalizations) 94 | == len(biases) 95 | ) 96 | 97 | self.adapter = nn.Sequential() 98 | 99 | if standardize_input: 100 | self.adapter.add_module( 101 | "standardize_input", nn.LayerNorm(output_feature_sizes[0], elementwise_affine=False) 102 | ) 103 | 104 | for i in range(nrof_layers): 105 | activation_function = activation_functions[i] 106 | self.adapter.add_module( 107 | f"fc_layer_{i}", 108 | FCLayer( 109 | in_features=output_feature_sizes[i], 110 | out_features=output_feature_sizes[i + 1], 111 | bias=biases[i], 112 | activation_fn=getattr(nn, activation_function)() if 
activation_function is not None else None, 113 | dropout=dropout_drop_probabilities[i], 114 | batch_norm=batch_normalizations[i], 115 | order=order, 116 | ), 117 | ) 118 | 119 | def forward(self, backbone_output: Tensor) -> Tensor: 120 | output: Tensor = self.adapter(backbone_output) 121 | return output 122 | 123 | 124 | class MLPWithPooling(Adapter): 125 | def __init__( 126 | self, 127 | output_feature_sizes: list[int], 128 | biases: Optional[list[bool]] = None, 129 | activation_fns: Optional[list[Optional[str]]] = None, 130 | dropout_drop_probs: Optional[list[float]] = None, 131 | batch_norms: Optional[list[bool]] = None, 132 | order: str = "LABDN", 133 | standardize_input: bool = True, 134 | pooling_method: Optional[str] = None, 135 | output_attribute_to_use: Optional[Literal["pooler_output", "last_hidden_state"]] = None, 136 | ) -> None: 137 | super().__init__() 138 | 139 | self.output_feature_sizes = output_feature_sizes 140 | self.output_embedding_size = output_feature_sizes[-1] 141 | 142 | nrof_layers = len(output_feature_sizes) - 1 143 | if nrof_layers > 0: 144 | self.projection = MLPLayer( 145 | output_feature_sizes=output_feature_sizes, 146 | biases=biases, 147 | activation_fns=activation_fns, 148 | dropout_drop_probs=dropout_drop_probs, 149 | batch_norms=batch_norms, 150 | order=order, 151 | standardize_input=standardize_input, 152 | ) 153 | else: 154 | self.projection = nn.Identity() # type: ignore 155 | 156 | if pooling_method == "mean_pooler": 157 | self.pooler = mean_pool_tokens 158 | elif pooling_method == "cls_pooler": 159 | self.pooler = cls_pool_tokens 160 | else: 161 | self.pooler = nn.Identity() 162 | 163 | if output_attribute_to_use is not None: 164 | self.get_output_tensor = attrgetter(output_attribute_to_use) 165 | else: 166 | self.get_output_tensor = nn.Identity() # type: ignore 167 | 168 | def forward(self, backbone_output: BaseModelOutputWithPooling) -> Tensor: 169 | output = self.get_output_tensor(backbone_output) 170 | output = self.pooler(output) 171 | output = self.projection(output) 172 | assert isinstance(output, Tensor) 173 | return output 174 | 175 | 176 | def mean_pool_tokens(tensor: Tensor) -> Tensor: 177 | dims = len(tensor.shape) 178 | if dims != 3: 179 | raise ValueError(f"Tokens pooling expects exactly 3 dimensional tensor, got: {dims}") 180 | return torch.mean(tensor, dim=1) 181 | 182 | 183 | def cls_pool_tokens(tensor: Tensor) -> Tensor: 184 | dims = len(tensor.shape) 185 | if dims != 3: 186 | raise ValueError(f"Tokens pooling expects exactly 3 dimensional tensor, got: {dims}") 187 | return tensor[:, 0, :] 188 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Make all targets .PHONY 2 | .PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST)) 3 | 4 | include .envs/.postgres 5 | include .envs/.mlflow-common 6 | include .envs/.mlflow-dev 7 | include .envs/.infrastructure 8 | export 9 | 10 | SHELL = /usr/bin/env bash 11 | USER_NAME = $(shell whoami) 12 | USER_ID = $(shell id -u) 13 | HOST_NAME = $(shell hostname) 14 | 15 | ifeq (, $(shell which docker-compose)) 16 | DOCKER_COMPOSE_COMMAND = docker compose 17 | else 18 | DOCKER_COMPOSE_COMMAND = docker-compose 19 | endif 20 | 21 | PROD_SERVICE_NAME = app-prod 22 | PROD_CONTAINER_NAME = cybulde-model-prod-container 23 | PROD_PROFILE_NAME = prod 24 | 25 | ifeq (, $(shell which nvidia-smi)) 26 | PROFILE = ci 27 | CONTAINER_NAME = 
cybulde-model-ci-container
28 | SERVICE_NAME = app-ci
29 | else
30 | PROFILE = dev
31 | CONTAINER_NAME = cybulde-model-dev-container
32 | SERVICE_NAME = app-dev
33 | endif
34 | 
35 | DIRS_TO_VALIDATE = cybulde
36 | DOCKER_COMPOSE_RUN = $(DOCKER_COMPOSE_COMMAND) run --rm $(SERVICE_NAME)
37 | DOCKER_COMPOSE_EXEC = $(DOCKER_COMPOSE_COMMAND) exec $(SERVICE_NAME)
38 | 
39 | DOCKER_COMPOSE_RUN_PROD = $(DOCKER_COMPOSE_COMMAND) run --rm $(PROD_SERVICE_NAME)
40 | DOCKER_COMPOSE_EXEC_PROD = $(DOCKER_COMPOSE_COMMAND) exec $(PROD_SERVICE_NAME)
41 | 
42 | IMAGE_TAG := $(shell echo "train-$$(uuidgen)")
43 | 
44 | # Returns true if the stem is a non-empty environment variable, or else raises an error.
45 | guard-%:
46 | @#$(or ${$*}, $(error $* is not set))
47 | 
48 | ## Generate final config. For overrides use: OVERRIDES=
49 | generate-final-config: up-prod
50 | @$(DOCKER_COMPOSE_EXEC_PROD) python cybulde/generate_final_config.py docker_image=${GCP_DOCKER_REGISTRY_URL}:${IMAGE_TAG} ${OVERRIDES}
51 | 
52 | ## Generate final config local. For overrides use: OVERRIDES=
53 | local-generate-final-config: up
54 | @$(DOCKER_COMPOSE_EXEC) python cybulde/generate_final_config.py ${OVERRIDES}
55 | 
56 | ## Run tasks on GCP
57 | run-tasks: generate-final-config push
58 | $(DOCKER_COMPOSE_EXEC_PROD) python cybulde/launch_job_on_gcp.py
59 | 
60 | ## Local run tasks
61 | local-run-tasks: local-generate-final-config
62 | $(DOCKER_COMPOSE_EXEC) torchrun cybulde/run_tasks.py
63 | 
64 | ## Starts jupyter lab
65 | notebook: up
66 | $(DOCKER_COMPOSE_EXEC) jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser
67 | 
68 | ## Sort code using isort
69 | sort: up
70 | $(DOCKER_COMPOSE_EXEC) isort --atomic $(DIRS_TO_VALIDATE)
71 | 
72 | ## Check sorting using isort
73 | sort-check: up
74 | $(DOCKER_COMPOSE_EXEC) isort --check-only --atomic $(DIRS_TO_VALIDATE)
75 | 
76 | ## Format code using black
77 | format: up
78 | $(DOCKER_COMPOSE_EXEC) black $(DIRS_TO_VALIDATE)
79 | 
80 | ## Check format using black
81 | format-check: up
82 | $(DOCKER_COMPOSE_EXEC) black --check $(DIRS_TO_VALIDATE)
83 | 
84 | ## Format and sort code using black and isort
85 | format-and-sort: sort format
86 | 
87 | ## Lint code using flake8
88 | lint: up format-check sort-check
89 | $(DOCKER_COMPOSE_EXEC) flake8 $(DIRS_TO_VALIDATE)
90 | 
91 | ## Check type annotations using mypy
92 | check-type-annotations: up
93 | $(DOCKER_COMPOSE_EXEC) mypy $(DIRS_TO_VALIDATE)
94 | 
95 | ## Run tests with pytest
96 | test: up
97 | $(DOCKER_COMPOSE_EXEC) pytest
98 | 
99 | ## Perform a full check
100 | full-check: lint check-type-annotations
101 | $(DOCKER_COMPOSE_EXEC) pytest --cov --cov-report xml --verbose
102 | 
103 | ## Builds docker image
104 | build:
105 | $(DOCKER_COMPOSE_COMMAND) build $(SERVICE_NAME)
106 | 
107 | ## Remove poetry.lock and build docker image
108 | build-for-dependencies:
109 | rm -f *.lock
110 | $(DOCKER_COMPOSE_COMMAND) build $(SERVICE_NAME)
111 | 
112 | ## Lock dependencies with poetry
113 | lock-dependencies: build-for-dependencies
114 | $(DOCKER_COMPOSE_RUN) bash -c "if [ -e /home/$(USER_NAME)/poetry.lock.build ]; then cp /home/$(USER_NAME)/poetry.lock.build ./poetry.lock; else poetry lock; fi"
115 | 
116 | ## Starts docker containers using "docker-compose up -d"
117 | up:
118 | ifeq (, $(shell docker ps -a | grep $(CONTAINER_NAME)))
119 | @make down
120 | endif
121 | @$(DOCKER_COMPOSE_COMMAND) --profile $(PROFILE) up -d --remove-orphans
122 | 
123 | ## Starts prod docker containers
124 | up-prod:
125 | ifeq (, $(shell docker ps -a | grep $(PROD_CONTAINER_NAME)))
126 | 
@make down 127 | endif 128 | @$(DOCKER_COMPOSE_COMMAND) --profile $(PROD_PROFILE_NAME) up -d --remove-orphans 129 | 130 | ## docker-compose down 131 | down: 132 | $(DOCKER_COMPOSE_COMMAND) down 133 | 134 | ## Open an interactive shell in docker container 135 | exec-in: up 136 | docker exec -it $(CONTAINER_NAME) bash 137 | 138 | push: guard-IMAGE_TAG build 139 | @gcloud auth configure-docker --quiet europe-west4-docker.pkg.dev 140 | @docker tag "$${DOCKER_IMAGE_NAME}:latest" "$${GCP_DOCKER_REGISTRY_URL}:$${IMAGE_TAG}" 141 | @docker push "$${GCP_DOCKER_REGISTRY_URL}:$${IMAGE_TAG}" 142 | 143 | ## Run ssh tunnel for MLFlow 144 | mlflow-ssh-tunnel: 145 | gcloud compute ssh "$${VM_NAME}" --zone "$${ZONE}" --tunnel-through-iap -- -N -L "$${PROD_MLFLOW_SERVER_PORT}:localhost:$${PROD_MLFLOW_SERVER_PORT}" 146 | 147 | ## Clean MLFlow volumes 148 | clean-mlflow-volumes: down 149 | docker volume rm cybulde-model_postgresql-mlflow-data cybulde-model_mlflow-artifact-store 150 | 151 | ## Deploy etcd server on GCE 152 | deploy-etcd-server: 153 | chmod +x ./scripts/deploy-etcd-server.sh 154 | ./scripts/deploy-etcd-server.sh 155 | 156 | .DEFAULT_GOAL := help 157 | 158 | # Inspired by 159 | # sed script explained: 160 | # /^##/: 161 | # * save line in hold space 162 | # * purge line 163 | # * Loop: 164 | # * append newline + line to hold space 165 | # * go to next line 166 | # * if line starts with doc comment, strip comment character off and loop 167 | # * remove target prerequisites 168 | # * append hold space (+ newline) to line 169 | # * replace newline plus comments by `---` 170 | # * print line 171 | # Separate expressions are necessary because labels cannot be delimited by 172 | # semicolon; see 173 | .PHONY: help 174 | help: 175 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 176 | @echo 177 | @sed -n -e "/^## / { \ 178 | h; \ 179 | s/.*//; \ 180 | :doc" \ 181 | -e "H; \ 182 | n; \ 183 | s/^## //; \ 184 | t doc" \ 185 | -e "s/:.*//; \ 186 | G; \ 187 | s/\\n## /---/; \ 188 | s/\\n/ /g; \ 189 | p; \ 190 | }" ${MAKEFILE_LIST} \ 191 | | LC_ALL='C' sort --ignore-case \ 192 | | awk -F '---' \ 193 | -v ncol=$$(tput cols) \ 194 | -v indent=36 \ 195 | -v col_on="$$(tput setaf 6)" \ 196 | -v col_off="$$(tput sgr0)" \ 197 | '{ \ 198 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 199 | n = split($$2, words, " "); \ 200 | line_length = ncol - indent; \ 201 | for (i = 1; i <= n; i++) { \ 202 | line_length -= length(words[i]) + 1; \ 203 | if (line_length <= 0) { \ 204 | line_length = ncol - indent - length(words[i]) - 1; \ 205 | printf "\n%*s ", -indent, " "; \ 206 | } \ 207 | printf "%s ", words[i]; \ 208 | } \ 209 | printf "\n"; \ 210 | }' \ 211 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 212 | 213 | -------------------------------------------------------------------------------- /cybulde/infrastructure/instance_template_creator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | from google.cloud import compute_v1 6 | 7 | from cybulde.utils.gcp_utils import wait_for_extended_operation 8 | from cybulde.utils.utils import get_logger 9 | 10 | 11 | class VMType(Enum): 12 | STANDARD = "STANDARD" 13 | SPOT = "SPOT" 14 | PREEMPTIBLE = "PREEMPTIBLE" 15 | 16 | 17 | @dataclass 18 | class BootDiskConfig: 19 | project_id: str 20 | name: str 21 | size_gb: int 22 | labels: dict[str, str] 23 | 24 | 25 | @dataclass 26 | class VMConfig: 27 | 
machine_type: str 28 | accelerator_count: int 29 | accelerator_type: str 30 | vm_type: VMType 31 | disks: list[str] 32 | 33 | 34 | @dataclass 35 | class VMMetadataConfig: 36 | instance_group_name: str 37 | docker_image: str 38 | zone: str 39 | python_hash_seed: int 40 | mlflow_tracking_uri: str 41 | node_count: int 42 | disks: list[str] 43 | 44 | 45 | class InstanceTemplateCreator: 46 | def __init__( 47 | self, 48 | scopes: list[str], 49 | network: str, 50 | subnetwork: str, 51 | startup_script_path: str, 52 | vm_config: VMConfig, 53 | boot_disk_config: BootDiskConfig, 54 | vm_metadata_config: VMMetadataConfig, 55 | template_name: str, 56 | project_id: str, 57 | labels: dict[str, str] = {}, 58 | ) -> None: 59 | self.logger = get_logger(self.__class__.__name__) 60 | 61 | self.scopes = scopes 62 | self.network = network 63 | self.subnetwork = subnetwork 64 | self.startup_script_path = startup_script_path 65 | self.vm_config = vm_config 66 | self.boot_disk_config = boot_disk_config 67 | self.vm_metadata_config = vm_metadata_config 68 | self.template_name = template_name.lower() 69 | self.project_id = project_id 70 | self.labels = labels 71 | 72 | self.template = compute_v1.InstanceTemplate() 73 | self.template.name = self.template_name 74 | 75 | def create_template(self) -> compute_v1.InstanceTemplate: 76 | self.logger.info("Started creating instance template...") 77 | self.logger.info(f"{self.vm_metadata_config=}") 78 | 79 | self._create_boot_disk() 80 | self._attach_disks() 81 | self._create_network_interface() 82 | self._create_machine_configuration() 83 | self._attach_metadata() 84 | 85 | self.logger.info("Creating instance template...") 86 | template_client = compute_v1.InstanceTemplatesClient() 87 | operation = template_client.insert(project=self.project_id, instance_template_resource=self.template) 88 | wait_for_extended_operation(operation, "instance template creation") 89 | 90 | self.logger.info("Instance template has been created...") 91 | return template_client.get(project=self.project_id, instance_template=self.template_name) 92 | 93 | def _create_boot_disk(self) -> None: 94 | boot_disk = compute_v1.AttachedDisk() 95 | boot_disk_initialize_params = compute_v1.AttachedDiskInitializeParams() 96 | boot_disk_image = self._get_disk_image(self.boot_disk_config.project_id, self.boot_disk_config.name) 97 | boot_disk_initialize_params.source_image = boot_disk_image.self_link 98 | boot_disk_initialize_params.disk_size_gb = self.boot_disk_config.size_gb 99 | boot_disk_initialize_params.labels = self.boot_disk_config.labels 100 | boot_disk.initialize_params = boot_disk_initialize_params 101 | boot_disk.auto_delete = True 102 | boot_disk.boot = True 103 | boot_disk.device_name = self.boot_disk_config.name 104 | 105 | if boot_disk: 106 | self.template.properties.disks = [boot_disk] 107 | 108 | def _get_disk_image(self, project_id: str, image_name: str) -> compute_v1.Image: 109 | """ 110 | Retrieve detailed information about a single image from a project. 111 | Args: 112 | project_id: project ID or project number of the Cloud project you want to list images from. 113 | image_name: name of the image you want to get details of. 114 | Returns: 115 | An instance of compute_v1.Image object with information about specified image. 
116 | """ 117 | image_client = compute_v1.ImagesClient() 118 | return image_client.get(project=project_id, image=image_name) 119 | 120 | def _attach_disks(self) -> None: 121 | disk_names = self.vm_config.disks 122 | for disk_name in disk_names: 123 | disk = compute_v1.AttachedDisk( 124 | auto_delete=False, boot=False, mode="READ_ONLY", device_name=disk_name, source=disk_name 125 | ) 126 | self.template.properties.disks.append(disk) 127 | 128 | if len(disk_names) > 0: 129 | self.template.properties.metadata.items.append(compute_v1.Items(key="disks", value="\n".join(disk_names))) 130 | 131 | def _create_network_interface(self) -> None: 132 | network_interface = compute_v1.NetworkInterface() 133 | network_interface.name = "nic0" # The default value 134 | network_interface.network = self.network 135 | network_interface.subnetwork = self.subnetwork 136 | self.template.properties.network_interfaces = [network_interface] 137 | 138 | def _create_machine_configuration(self) -> None: 139 | self.template.properties.machine_type = self.vm_config.machine_type 140 | if self.vm_config.accelerator_count > 0: 141 | self.template.properties.guest_accelerators = [ 142 | compute_v1.AcceleratorConfig( 143 | accelerator_type=self.vm_config.accelerator_type, accelerator_count=self.vm_config.accelerator_count 144 | ) 145 | ] 146 | self.template.properties.service_accounts = [compute_v1.ServiceAccount(email="default", scopes=self.scopes)] 147 | self.template.properties.labels = self.labels 148 | 149 | vm_type = VMType(self.vm_config.vm_type) 150 | if vm_type == VMType.PREEMPTIBLE: 151 | self.logger.info("Using PREEMPTIBLE machine") 152 | self.template.properties.scheduling = compute_v1.Scheduling(preemptible=True) 153 | elif vm_type == VMType.SPOT: 154 | self.logger.info("Using SPOT machine") 155 | self.template.properties.scheduling = compute_v1.Scheduling( 156 | provisioning_model=compute_v1.Scheduling.ProvisioningModel.SPOT.name, # type: ignore 157 | on_host_maintenance=compute_v1.Scheduling.OnHostMaintenance.TERMINATE.name, # type: ignore 158 | ) 159 | elif vm_type == VMType.STANDARD: 160 | self.logger.info("Using STANDARD machine") 161 | self.template.properties.scheduling = compute_v1.Scheduling( 162 | provisioning_model=compute_v1.Scheduling.ProvisioningModel.STANDARD.name, # type: ignore 163 | on_host_maintenance=compute_v1.Scheduling.OnHostMaintenance.TERMINATE.name, # type: ignore 164 | ) 165 | else: 166 | raise RuntimeError(f"Unsupported {vm_type=}") 167 | 168 | def _attach_metadata(self) -> None: 169 | startup_script = self._read_startup_script(self.startup_script_path) 170 | self.template.properties.metadata.items.append(compute_v1.Items(key="startup-script", value=startup_script)) 171 | 172 | for meta_data_name, meta_data_value in self.vm_metadata_config.items(): # type: ignore 173 | self.template.properties.metadata.items.append( 174 | compute_v1.Items(key=meta_data_name, value=str(meta_data_value)) 175 | ) 176 | 177 | def _read_startup_script(self, startup_script_path: str) -> str: 178 | return Path(startup_script_path).read_text() 179 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/config.yaml: -------------------------------------------------------------------------------- 1 | # Do not edit this file. It is automatically generated by cybulde/generate_final_config.py. 2 | # If you want to modify configuration, edit source files in cybulde/configs directory. 
3 | 4 | defaults: 5 | - override hydra/hydra_logging: disabled 6 | - _self_ 7 | hydra: 8 | output_subdir: null 9 | run: 10 | dir: . 11 | 12 | infrastructure: 13 | project_id: cybulde 14 | zone: europe-west4-b 15 | instance_group_creator: 16 | _target_: cybulde.infrastructure.instance_group_creator.InstanceGroupCreator 17 | instance_template_creator: 18 | _target_: cybulde.infrastructure.instance_template_creator.InstanceTemplateCreator 19 | scopes: 20 | - https://www.googleapis.com/auth/cloud-platform 21 | - https://www.googleapis.com/auth/cloud.useraccounts.readonly 22 | - https://www.googleapis.com/auth/cloudruntimeconfig 23 | network: https://www.googleapis.com/compute/v1/projects/cybulde/global/networks/default 24 | subnetwork: https://www.googleapis.com/compute/v1/projects/cybulde/regions/europe-west4/subnetworks/default 25 | startup_script_path: scripts/vm_startup/task_runner_startup_script.sh 26 | vm_config: 27 | machine_type: n1-standard-8 28 | accelerator_count: 1 29 | accelerator_type: nvidia-tesla-t4 30 | vm_type: STANDARD 31 | disks: [] 32 | boot_disk_config: 33 | project_id: deeplearning-platform-release 34 | name: common-cu113-v20230925 35 | size_gb: 50 36 | labels: 37 | project: cybulde 38 | vm_metadata_config: 39 | instance_group_name: cybulde-None-20231115215415 40 | docker_image: null 41 | zone: europe-west4-b 42 | python_hash_seed: 42 43 | mlflow_tracking_uri: http://127.0.0.1:6101 44 | node_count: 1 45 | disks: [] 46 | etcd_ip: 10.164.0.12:2379 47 | template_name: cybulde-None-20231115215415 48 | project_id: cybulde 49 | labels: 50 | project: cybulde 51 | name: cybulde-None-20231115215415 52 | node_count: 1 53 | project_id: cybulde 54 | zone: europe-west4-b 55 | mlflow: 56 | mlflow_external_tracking_uri: http://127.0.0.1:6101 57 | mlflow_internal_tracking_uri: http://127.0.0.1:6101 58 | experiment_name: cybulde 59 | run_name: null 60 | run_id: 63b73e0189824d4abcbd42d07fa26428 61 | experiment_id: '16' 62 | experiment_url: http://127.0.0.1:6101/#/experiments/16/runs/63b73e0189824d4abcbd42d07fa26428 63 | artifact_uri: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts 64 | etcd_ip: 10.164.0.12:2379 65 | save_last_checkpoint_every_n_train_steps: 500 66 | seed: 1234 67 | tasks: 68 | binary_text_classification_task: 69 | _target_: cybulde.training.tasks.tar_model_exporting_training_task.TarModelExportingTrainingTask 70 | name: binary_text_classfication_task 71 | data_module: 72 | _target_: cybulde.data_modules.data_modules.TextClassificationDataModule 73 | batch_size: 1024 74 | shuffle: false 75 | num_workers: 8 76 | pin_memory: true 77 | drop_last: true 78 | persistent_workers: false 79 | train_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/train.parquet 80 | dev_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/dev.parquet 81 | test_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/test.parquet 82 | transformation: 83 | _target_: cybulde.models.transformations.HuggingFaceTokenizationTransformation 84 | pretrained_tokenizer_name_or_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer 85 | max_sequence_length: 200 86 | text_column_name: cleaned_text 87 | label_column_name: label 88 | lightning_module: 89 | _target_: cybulde.training.lightning_modules.binary_text_classification.BinaryTextClassificationTrainingLightningModule 90 | model: 91 | _target_: cybulde.models.models.BinaryTextClassificationModel 92 | backbone: 93 | _target_: cybulde.models.backbones.HuggingFaceBackbone 94 | 
transformation: 95 | _target_: cybulde.models.transformations.HuggingFaceTokenizationTransformation 96 | pretrained_tokenizer_name_or_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer 97 | max_sequence_length: 200 98 | pretrained_model_name_or_path: prajjwal1/bert-tiny 99 | pretrained: false 100 | adapter: 101 | _target_: cybulde.models.adapters.MLPWithPooling 102 | output_feature_sizes: 103 | - -1 104 | biases: null 105 | activation_fns: null 106 | dropout_drop_probs: null 107 | batch_norms: null 108 | order: LABDN 109 | standardize_input: true 110 | pooling_method: null 111 | output_attribute_to_use: pooler_output 112 | head: 113 | _target_: cybulde.models.heads.SigmoidHead 114 | in_features: 128 115 | out_features: 1 116 | loss: 117 | _target_: cybulde.training.loss_functions.BCEWithLogitsLoss 118 | reduction: mean 119 | optimizer: 120 | _target_: torch.optim.AdamW 121 | _partial_: true 122 | lr: 5.0e-05 123 | betas: 124 | - 0.9 125 | - 0.999 126 | eps: 1.0e-08 127 | weight_decay: 0.001 128 | amsgrad: false 129 | foreach: null 130 | maximize: false 131 | capturable: false 132 | scheduler: 133 | _target_: cybulde.training.schedulers.CommonLightningScheduler 134 | scheduler: 135 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 136 | _partial_: true 137 | mode: max 138 | factor: 0.1 139 | patience: 5 140 | threshold: 0.0001 141 | threshold_mode: rel 142 | cooldown: 0 143 | min_lr: 0.0 144 | eps: 1.0e-08 145 | verbose: false 146 | interval: epoch 147 | frequency: 1 148 | monitor: validation_f1_score 149 | strict: true 150 | name: null 151 | trainer: 152 | _target_: lightning.pytorch.trainer.trainer.Trainer 153 | accelerator: gpu 154 | strategy: ddp_find_unused_parameters_true 155 | devices: auto 156 | num_nodes: 1 157 | precision: 16-mixed 158 | logger: 159 | - _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 160 | experiment_name: cybulde 161 | run_name: null 162 | tracking_uri: http://127.0.0.1:6101 163 | tags: null 164 | save_dir: null 165 | prefix: '' 166 | artifact_location: null 167 | run_id: 63b73e0189824d4abcbd42d07fa26428 168 | callbacks: 169 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 170 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/best-checkpoints/ 171 | filename: null 172 | monitor: validation_f1_score 173 | verbose: false 174 | save_last: true 175 | save_top_k: 2 176 | mode: max 177 | auto_insert_metric_name: false 178 | save_weights_only: false 179 | every_n_train_steps: null 180 | train_time_interval: null 181 | every_n_epochs: null 182 | save_on_train_epoch_end: null 183 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 184 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/last-checkpoints/ 185 | filename: checkpoint-{epoch} 186 | monitor: null 187 | verbose: false 188 | save_last: true 189 | save_top_k: -1 190 | mode: min 191 | auto_insert_metric_name: false 192 | save_weights_only: false 193 | every_n_train_steps: 500 194 | train_time_interval: null 195 | every_n_epochs: null 196 | save_on_train_epoch_end: null 197 | - _target_: lightning.pytorch.callbacks.LearningRateMonitor 198 | logging_interval: step 199 | fast_dev_run: false 200 | max_epochs: 20 201 | min_epochs: null 202 | max_steps: -1 203 | min_steps: null 204 | max_time: null 205 | limit_train_batches: 1.0 206 | limit_val_batches: 1.0 207 | limit_test_batches: 1.0 208 | limit_predict_batches: 1.0 209 | overfit_batches: 0.0 210 | val_check_interval: 1.0 211 | check_val_every_n_epoch: 1 
212 | num_sanity_val_steps: 2 213 | log_every_n_steps: 20 214 | enable_checkpointing: true 215 | enable_progress_bar: true 216 | enable_model_summary: true 217 | accumulate_grad_batches: 1 218 | gradient_clip_val: 5.0 219 | gradient_clip_algorithm: value 220 | deterministic: null 221 | benchmark: null 222 | inference_mode: true 223 | use_distributed_sampler: true 224 | detect_anomaly: false 225 | barebones: false 226 | sync_batchnorm: true 227 | reload_dataloaders_every_n_epochs: 0 228 | default_root_dir: ./data/pytorch-lightning 229 | best_training_checkpoint: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/best-checkpoints/last.ckpt 230 | last_training_checkpoint: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/last-checkpoints/last.ckpt 231 | tar_model_export_path: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/exported_model.tar.gz 232 | binary_text_evaluation_task: 233 | _target_: cybulde.evaluation.tasks.common_evaluation_task.CommonEvaluationTask 234 | name: binary_text_evaluation_task 235 | data_module: 236 | _target_: cybulde.data_modules.data_modules.TextClassificationDataModule 237 | batch_size: 1024 238 | shuffle: false 239 | num_workers: 8 240 | pin_memory: true 241 | drop_last: true 242 | persistent_workers: false 243 | train_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/train.parquet 244 | dev_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/dev.parquet 245 | test_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/test.parquet 246 | transformation: 247 | _target_: cybulde.models.transformations.HuggingFaceTokenizationTransformation 248 | pretrained_tokenizer_name_or_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer 249 | max_sequence_length: 200 250 | text_column_name: cleaned_text 251 | label_column_name: label 252 | lightning_module: 253 | _target_: cybulde.evaluation.lightning_modules.binary_text_evaluation.BinaryTextEvaluationLightningModule 254 | _partial_: true 255 | trainer: 256 | _target_: lightning.pytorch.trainer.trainer.Trainer 257 | accelerator: gpu 258 | strategy: ddp_find_unused_parameters_true 259 | devices: auto 260 | num_nodes: 1 261 | precision: 16-mixed 262 | logger: 263 | - _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 264 | experiment_name: cybulde 265 | run_name: null 266 | tracking_uri: http://127.0.0.1:6101 267 | tags: null 268 | save_dir: null 269 | prefix: '' 270 | artifact_location: null 271 | run_id: 63b73e0189824d4abcbd42d07fa26428 272 | callbacks: 273 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 274 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/best-checkpoints/ 275 | filename: null 276 | monitor: validation_f1_score 277 | verbose: false 278 | save_last: true 279 | save_top_k: 2 280 | mode: max 281 | auto_insert_metric_name: false 282 | save_weights_only: false 283 | every_n_train_steps: null 284 | train_time_interval: null 285 | every_n_epochs: null 286 | save_on_train_epoch_end: null 287 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 288 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/last-checkpoints/ 289 | filename: checkpoint-{epoch} 290 | monitor: null 291 | verbose: false 292 | save_last: true 293 | save_top_k: -1 294 | mode: min 295 | auto_insert_metric_name: false 296 | save_weights_only: false 297 | every_n_train_steps: 500 298 | train_time_interval: null 299 | every_n_epochs: null 300 | 
300 |         save_on_train_epoch_end: null
301 |       - _target_: lightning.pytorch.callbacks.LearningRateMonitor
302 |         logging_interval: step
303 |       fast_dev_run: false
304 |       max_epochs: 20
305 |       min_epochs: null
306 |       max_steps: -1
307 |       min_steps: null
308 |       max_time: null
309 |       limit_train_batches: 1.0
310 |       limit_val_batches: 1.0
311 |       limit_test_batches: 1.0
312 |       limit_predict_batches: 1.0
313 |       overfit_batches: 0.0
314 |       val_check_interval: 1.0
315 |       check_val_every_n_epoch: 1
316 |       num_sanity_val_steps: 2
317 |       log_every_n_steps: 20
318 |       enable_checkpointing: true
319 |       enable_progress_bar: true
320 |       enable_model_summary: true
321 |       accumulate_grad_batches: 1
322 |       gradient_clip_val: 5.0
323 |       gradient_clip_algorithm: value
324 |       deterministic: null
325 |       benchmark: null
326 |       inference_mode: true
327 |       use_distributed_sampler: true
328 |       detect_anomaly: false
329 |       barebones: false
330 |       sync_batchnorm: true
331 |       reload_dataloaders_every_n_epochs: 0
332 |       default_root_dir: ./data/pytorch-lightning
333 |     tar_model_path: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/exported_model.tar.gz
334 |     model_selector:
335 |       _target_: cybulde.evaluation.model_selector.ModelSelector
336 |       mlflow_run_id: 63b73e0189824d4abcbd42d07fa26428
337 |       must_be_better_metric_comparers:
338 |         f1_score:
339 |           _target_: cybulde.evaluation.model_selector.MetricComparer
340 |           bigger_is_better: true
341 |           can_be_equal: false
342 |           metric_name: test_f1_score
343 |           threshold: 0.0
344 |         model_size:
345 |           _target_: cybulde.evaluation.model_selector.MetricComparer
346 |           bigger_is_better: false
347 |           can_be_equal: true
348 |           metric_name: model_size
349 |           threshold: 0.0
350 |       to_be_thresholded_metric_comparers: {}
351 |       threshold: 0.0
352 |       registered_model_name: bert_tiny
353 | docker_image: null
354 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                     GNU GENERAL PUBLIC LICENSE
2 |                        Version 2, June 1991
3 | 
4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 |  Everyone is permitted to copy and distribute verbatim copies
7 |  of this license document, but changing it is not allowed.
8 | 
9 |                             Preamble
10 | 
11 |   The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 | 
21 |   When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 | 
28 |   To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 | 
33 |   For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 | 
39 |   We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 | 
43 |   Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 | 
50 |   Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 | 
56 |   The precise terms and conditions for copying, distribution and
57 | modification follow.
58 | 
59 |                     GNU GENERAL PUBLIC LICENSE
60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 | 
62 |   0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 | 
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 | 
79 |   1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 | 
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 | 
90 |   2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 | 
95 |     a) You must cause the modified files to carry prominent notices
96 |     stating that you changed the files and the date of any change.
97 | 
98 |     b) You must cause any work that you distribute or publish, that in
99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License. (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code. (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 
--------------------------------------------------------------------------------
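How a generated config like the one in cybulde/configs/automatically_generated/config.yaml
is consumed: a minimal sketch, assuming hydra-core and omegaconf are installed.
The config path and the `tasks` key layout below are assumptions inferred from
the file itself, not a confirmed project API.

    # Minimal sketch: turning the `_target_` nodes of the generated config
    # into live objects. Everything task-specific here is an assumption.
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("cybulde/configs/automatically_generated/config.yaml")
    task_cfg = cfg.tasks.binary_text_classification_task  # assumed key layout

    # instantiate() builds the class or callable named by `_target_` and, by
    # default, recurses into nested nodes (data_module -> transformation,
    # model -> backbone -> transformation, and so on).
    data_module = instantiate(task_cfg.data_module)

    # Nodes marked `_partial_: true` (the AdamW optimizer, the inner
    # ReduceLROnPlateau scheduler) come back as functools.partial objects,
    # because their remaining arguments -- the model parameters -- are only
    # known once the model has been built:
    optimizer_factory = instantiate(task_cfg.lightning_module.optimizer)
    # optimizer = optimizer_factory(model.parameters())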