├── cybulde ├── configs │ ├── __init__.py │ ├── automatically_generated │ │ ├── __init__.py │ │ ├── full_config_header.yaml │ │ ├── hydra │ │ │ └── job_logging │ │ │ │ └── custom.yaml │ │ └── config.yaml │ ├── config.yaml │ └── hydra │ │ └── job_logging │ │ └── custom.yaml ├── utils │ ├── mixins.py │ ├── utils.py │ ├── torch_utils.py │ ├── io_utils.py │ ├── config_utils.py │ ├── gcp_utils.py │ └── mlflow_utils.py ├── __init__.py ├── tests.py ├── training │ ├── loss_functions.py │ ├── tasks │ │ ├── bases.py │ │ ├── common_training_task.py │ │ └── tar_model_exporting_training_task.py │ ├── schedulers.py │ └── lightning_modules │ │ ├── bases.py │ │ └── binary_text_classification.py ├── config_schemas │ ├── infrastructure │ │ ├── instance_group_creator_schemas.py │ │ ├── infrastructure_schema.py │ │ └── instance_template_creator_schemas.py │ ├── base_schemas.py │ ├── training │ │ ├── loss_schemas.py │ │ ├── optimizer_schemas.py │ │ ├── scheduler_schemas.py │ │ ├── training_task_schemas.py │ │ └── training_lightning_module_schemas.py │ ├── models │ │ ├── head_schemas.py │ │ ├── transformation_schemas.py │ │ ├── backbone_schemas.py │ │ ├── model_schemas.py │ │ └── adapter_schemas.py │ ├── trainer │ │ ├── logger_schemas.py │ │ ├── callbacks_schemas.py │ │ └── trainer_schemas.py │ ├── evaluation │ │ ├── evaluation_lightning_module_schemas.py │ │ ├── evaluation_task_schemas.py │ │ └── model_selector_schemas.py │ ├── config_schema.py │ ├── data_module_schemas.py │ └── experiment │ │ └── bert │ │ └── local_bert.py ├── data_modules │ ├── datasets.py │ └── data_modules.py ├── models │ ├── heads.py │ ├── models.py │ ├── common │ │ ├── utils.py │ │ ├── io_utils.py │ │ └── exporter.py │ ├── backbones.py │ ├── transformations.py │ └── adapters.py ├── evaluation │ ├── lightning_modules │ │ ├── bases.py │ │ └── binary_text_evaluation.py │ ├── tasks │ │ ├── bases.py │ │ └── common_evaluation_task.py │ └── model_selector.py ├── web_app │ └── server.py ├── run_tasks.py ├── launch_job_on_gcp.py ├── generate_final_config.py └── infrastructure │ ├── instance_group_creator.py │ └── instance_template_creator.py ├── README.md ├── .envs ├── .postgres ├── .mlflow-common ├── .mlflow-prod ├── .infrastructure └── .mlflow-dev ├── .dockerignore ├── docker ├── scripts │ ├── start-tracking-server.sh │ ├── start-prediction-service.sh │ └── startup-script.sh └── Dockerfile ├── .gitattributes ├── setup.cfg ├── scripts ├── deploy-etcd-server.sh └── vm_startup │ └── task_runner_startup_script.sh ├── docker-compose.yaml ├── .gitignore ├── pyproject.toml ├── Makefile └── LICENSE /cybulde/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cybulde-project-template 2 | A simple template for the Cybulde project 3 | -------------------------------------------------------------------------------- /.envs/.postgres: -------------------------------------------------------------------------------- 1 | POSTGRES_DB=backend 2 | POSTGRES_USER=backend 3 | POSTGRES_PASSWORD=backend 4 | -------------------------------------------------------------------------------- /cybulde/utils/mixins.py:
-------------------------------------------------------------------------------- 1 | class LoggableParamsMixin: 2 | def loggable_params(self) -> list[str]: 3 | return [] 4 | -------------------------------------------------------------------------------- /.envs/.mlflow-common: -------------------------------------------------------------------------------- 1 | 2 | 3 | LOCAL_DEV_MLFLOW_SERVER_HOST=127.0.0.1 4 | LOCAL_DEV_MLFLOW_SERVER_PORT=6101 5 | 6 | PROD_MLFLOW_SERVER_PORT=6100 7 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # We would like to ignore everything, then allow only required files and directories to pass 2 | * 3 | 4 | !docker 5 | !cybulde 6 | !pyproject.toml 7 | !poetry.lock 8 | -------------------------------------------------------------------------------- /cybulde/utils/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import socket 3 | 4 | 5 | def get_logger(name: str) -> logging.Logger: 6 | return logging.getLogger(f"[{socket.gethostname()}] {name}") 7 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/full_config_header.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - override hydra/hydra_logging: disabled 3 | - _self_ 4 | hydra: 5 | output_subdir: null 6 | run: 7 | dir: . 8 | -------------------------------------------------------------------------------- /cybulde/configs/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - config_schema 3 | 4 | - override hydra/job_logging: custom 5 | - override hydra/hydra_logging: disabled 6 | - _self_ 7 | 8 | hydra: 9 | output_subdir: null 10 | run: 11 | dir: . 
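For orientation, here is a minimal, hypothetical sketch (not a file in this repo; the repo's own entry points go through cybulde.utils.config_utils.get_config instead) of how a root config like the one above is composed: the structured config_schema entry in its defaults list must first be registered in Hydra's ConfigStore, which is what setup_config() in cybulde/config_schemas/config_schema.py does. The config_path below assumes the script lives at the repository root.

import hydra
from omegaconf import DictConfig, OmegaConf

from cybulde.config_schemas.config_schema import setup_config

setup_config()  # registers "config_schema" and its nested groups in the ConfigStore

@hydra.main(config_path="cybulde/configs", config_name="config", version_base=None)
def main(config: DictConfig) -> None:
    # The composed config is validated against the structured Config dataclass.
    print(OmegaConf.to_yaml(config))

if __name__ == "__main__":
    main()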
12 | -------------------------------------------------------------------------------- /cybulde/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | warnings.filterwarnings(action="ignore", category=RuntimeWarning, module=".*schema.*") 4 | 5 | from cybulde.config_schemas.experiment.bert import local_bert # noqa: E402 6 | 7 | __all__ = ["local_bert"] 8 | -------------------------------------------------------------------------------- /docker/scripts/start-tracking-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mlflow server \ 4 | --backend-store-uri "${MLFLOW_BACKEND_STORE}" \ 5 | --default-artifact-root "${MLFLOW_ARTIFACT_STORE}" \ 6 | --host 0.0.0.0 \ 7 | --port "${LOCAL_DEV_MLFLOW_SERVER_PORT}" 8 | -------------------------------------------------------------------------------- /.envs/.mlflow-prod: -------------------------------------------------------------------------------- 1 | IS_PROD_ENV=true 2 | GOOGLE_CLOUD_PROJECT=cybulde 3 | MLFLOW_INTERNAL_TRACKING_URI=http://cybulde-mlflow.europe-west4-a.c.${GOOGLE_CLOUD_PROJECT}.internal:${PROD_MLFLOW_SERVER_PORT} 4 | MLFLOW_TRACKING_URI=http://localhost:${PROD_MLFLOW_SERVER_PORT} 5 | 6 | -------------------------------------------------------------------------------- /docker/scripts/start-prediction-service.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | set -o nounset 6 | 7 | uvicorn cybulde.web_app.server:app \ 8 | --host "${UVICORN_HOST:-0.0.0.0}" \ 9 | --port "${UVICORN_PORT:-8001}" \ 10 | --workers "${UVICORN_WORKERS:-1}" 11 | -------------------------------------------------------------------------------- /cybulde/tests.py: -------------------------------------------------------------------------------- 1 | from cybulde.utils.mlflow_utils import get_all_experiment_ids, get_best_run 2 | 3 | experiments = get_all_experiment_ids() 4 | 5 | print(f"{experiments=}") 6 | 7 | best_runs = get_best_run() 8 | 9 | print(f"{best_runs=}") 10 | print(f"{best_runs['metrics.test_f1_score']=}") 11 | -------------------------------------------------------------------------------- /.envs/.infrastructure: -------------------------------------------------------------------------------- 1 | GCP_PROJECT_ID=cybulde 2 | GCP_ARTIFACT_REGISTRY_REPOSITORY_NAME=cybulde 3 | VM_NAME=cybulde-mlflow 4 | ZONE=europe-west4-a 5 | DOCKER_IMAGE_NAME=cybulde-model 6 | GCP_DOCKER_REGISTRY_URL=europe-west4-docker.pkg.dev/${GCP_PROJECT_ID}/${GCP_ARTIFACT_REGISTRY_REPOSITORY_NAME}/${DOCKER_IMAGE_NAME} 7 | -------------------------------------------------------------------------------- /.envs/.mlflow-dev: -------------------------------------------------------------------------------- 1 | 2 | IS_PROD_ENV=false 3 | 4 | MLFLOW_BACKEND_STORE=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@mlflow-backend-store/${POSTGRES_DB} 5 | MLFLOW_ARTIFACT_STORE=/mlflow-artifact-store 6 | 7 | MLFLOW_INTERNAL_TRACKING_URI=http://${LOCAL_DEV_MLFLOW_SERVER_HOST}:${LOCAL_DEV_MLFLOW_SERVER_PORT} 8 | MLFLOW_TRACKING_URI=${MLFLOW_INTERNAL_TRACKING_URI} 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pxd text diff=python 2 | *.py text diff=python 3 | *.py3 text diff=python 4 | *.pyw text diff=python 5 | *.pyx text 
diff=python 6 | *.pyz text diff=python 7 | *.pyi text diff=python 8 | 9 | *.pkl binary 10 | *.pickle binary 11 | *.pyc binary 12 | *.pyd binary 13 | *.pyo binary 14 | 15 | *.ipynb text 16 | * text=auto 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # https://pycodestyle.readthedocs.io/en/latest/intro.html#error-codes 3 | # http://flake8.pycqa.org/en/latest/user/error-codes.html 4 | ignore = E501,W503,W504,E203,I201,I202 5 | max-line-length = 120 6 | import-order-style = pep8 7 | application_import_names = 8 | cybulde 9 | exclude = 10 | .git 11 | 12 | [pycodestyle] 13 | max-line-length = 120 14 | -------------------------------------------------------------------------------- /docker/scripts/startup-script.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | set -o nounset 6 | 7 | if [[ "${IS_PROD_ENV}" == "true" ]]; then 8 | /usr/local/gcloud/google-cloud-sdk/bin/gcloud compute ssh "${VM_NAME}" --zone "${ZONE}" --tunnel-through-iap -- -4 -N -L ${PROD_MLFLOW_SERVER_PORT}:localhost:${PROD_MLFLOW_SERVER_PORT} 9 | else 10 | /start-prediction-service.sh & 11 | /start-tracking-server.sh & 12 | tail -F anything 13 | fi 14 | -------------------------------------------------------------------------------- /cybulde/training/loss_functions.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch.nn.functional as F 4 | 5 | from torch import Tensor, nn 6 | 7 | 8 | class LossFunction(nn.Module): 9 | pass 10 | 11 | 12 | class BCEWithLogitsLoss(LossFunction): 13 | def __init__(self, reduction: str = "mean") -> None: 14 | super().__init__() 15 | self.reduction = reduction 16 | 17 | def forward(self, x: Tensor, target: Tensor, pos_weight: Optional[Tensor] = None) -> Tensor: 18 | return F.binary_cross_entropy_with_logits(x, target, reduction=self.reduction, pos_weight=pos_weight) 19 | -------------------------------------------------------------------------------- /cybulde/configs/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | brief: 4 | format: '[%(levelname)s] %(asctime)s %(name)s: %(message)s' 5 | datefmt: '%Y-%m-%d %H:%M:%S' 6 | handlers: 7 | file: 8 | level: INFO 9 | class: logging.handlers.RotatingFileHandler 10 | formatter: brief 11 | maxBytes: 1024 12 | backupCount: 0 13 | filename: logs.log 14 | mode: w 15 | encoding: utf8 16 | console: 17 | level: DEBUG 18 | class: logging.StreamHandler 19 | formatter: brief 20 | stream: ext://sys.stdout 21 | root: 22 | level: INFO 23 | handlers: [file, console] 24 | 25 | disable_existing_loggers: false 26 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | brief: 4 | format: '[%(levelname)s] %(asctime)s %(name)s: %(message)s' 5 | datefmt: '%Y-%m-%d %H:%M:%S' 6 | handlers: 7 | file: 8 | level: INFO 9 | class: logging.handlers.RotatingFileHandler 10 | formatter: brief 11 | maxBytes: 1024 12 | backupCount: 0 13 | filename: logs.log 14 | mode: w 15 | encoding: utf8 16 | console: 17 | level: DEBUG 18 | class: 
logging.StreamHandler 19 | formatter: brief 20 | stream: ext://sys.stdout 21 | root: 22 | level: INFO 23 | handlers: [file, console] 24 | 25 | disable_existing_loggers: false 26 | -------------------------------------------------------------------------------- /cybulde/config_schemas/infrastructure/instance_group_creator_schemas.py: -------------------------------------------------------------------------------- 1 | from omegaconf import SI 2 | from pydantic.dataclasses import dataclass 3 | 4 | from cybulde.config_schemas.infrastructure.instance_template_creator_schemas import InstanceTemplateCreatorConfig 5 | 6 | 7 | @dataclass 8 | class InstanceGroupCreatorConfig: 9 | _target_: str = "cybulde.infrastructure.instance_group_creator.InstanceGroupCreator" 10 | instance_template_creator: InstanceTemplateCreatorConfig = InstanceTemplateCreatorConfig() 11 | name: str = SI("${infrastructure.mlflow.experiment_name}-${infrastructure.mlflow.run_name}-${now:%Y%m%d%H%M%S}") 12 | node_count: int = 1 13 | project_id: str = SI("${infrastructure.project_id}") 14 | zone: str = SI("${infrastructure.zone}") 15 | -------------------------------------------------------------------------------- /cybulde/data_modules/datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from torch import Tensor 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class TextClassificationDataset(Dataset): 8 | def __init__(self, df_path: str, text_column_name: str, label_column_name: str) -> None: 9 | super().__init__() 10 | self.df = pd.read_parquet(df_path) 11 | self.text_column_name = text_column_name 12 | self.label_column_name = label_column_name 13 | 14 | def __getitem__(self, idx: int) -> tuple[str, Tensor]: 15 | row = self.df.iloc[idx] 16 | 17 | text = row[self.text_column_name] 18 | label = row[self.label_column_name] 19 | 20 | return text, Tensor([label]) 21 | 22 | def __len__(self) -> int: 23 | return len(self.df) 24 | -------------------------------------------------------------------------------- /cybulde/config_schemas/base_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from omegaconf import MISSING 4 | 5 | from cybulde.config_schemas.data_module_schemas import DataModuleConfig 6 | from cybulde.config_schemas.trainer.trainer_schemas import TrainerConfig 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class LightningModuleConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | 14 | 15 | @dataclass 16 | class TaskConfig(LoggableParamsMixin): 17 | _target_: str = MISSING 18 | name: str = MISSING 19 | data_module: DataModuleConfig = MISSING 20 | lightning_module: LightningModuleConfig = MISSING 21 | trainer: TrainerConfig = MISSING 22 | 23 | def loggable_params(self) -> list[str]: 24 | return ["_target_"] 25 | -------------------------------------------------------------------------------- /cybulde/models/heads.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor, nn 2 | 3 | 4 | class Head(nn.Module): 5 | pass 6 | 7 | 8 | class SoftmaxHead(Head): 9 | def __init__(self, in_features: int, out_features: int, dim: int = 1) -> None: 10 | super().__init__() 11 | self.head = nn.Sequential(nn.Linear(in_features, out_features), nn.Softmax(dim=dim)) 12 | 13 | def forward(self, x: Tensor) -> Tensor: 14 | output: Tensor = self.head(x) 15 | return output 16 | 17 | 18 
| class SigmoidHead(Head): 19 | def __init__(self, in_features: int, out_features: int) -> None: 20 | super().__init__() 21 | 22 | self.head = nn.Sequential(nn.Linear(in_features, out_features), nn.Sigmoid()) 23 | 24 | def forward(self, x: Tensor) -> Tensor: 25 | output: Tensor = self.head(x) 26 | return output 27 | -------------------------------------------------------------------------------- /cybulde/evaluation/lightning_modules/bases.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any, Protocol 3 | 4 | from lightning.pytorch import LightningModule 5 | from torch import Tensor 6 | 7 | from cybulde.models.models import Model 8 | from cybulde.models.transformations import Transformation 9 | 10 | 11 | class EvaluationLightningModule(LightningModule): 12 | def __init__(self, model: Model) -> None: 13 | super().__init__() 14 | self.model = model 15 | 16 | @abstractmethod 17 | def test_step(self, batch: Any, batch_idx: int) -> Tensor: 18 | ... 19 | 20 | @abstractmethod 21 | def get_transformation(self) -> Transformation: 22 | ... 23 | 24 | 25 | class PartialEvaluationLightningModuleType(Protocol): 26 | def __call__(self, model: Model) -> EvaluationLightningModule: 27 | ... 28 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/loss_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.utils.mixins import LoggableParamsMixin 7 | 8 | 9 | @dataclass 10 | class LossFunctionConfig(LoggableParamsMixin): 11 | _target_: str = MISSING 12 | 13 | def loggable_params(self) -> list[str]: 14 | return ["_target_"] 15 | 16 | 17 | @dataclass 18 | class BCEWithLogitsLossConfig(LossFunctionConfig): 19 | _target_: str = "cybulde.training.loss_functions.BCEWithLogitsLoss" 20 | reduction: str = "mean" 21 | 22 | 23 | def setup_config() -> None: 24 | cs = ConfigStore.instance() 25 | cs.store( 26 | name="bce_with_logits_loss_schema", 27 | group="tasks/lightning_module/loss", 28 | node=BCEWithLogitsLossConfig, 29 | ) 30 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/head_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.utils.mixins import LoggableParamsMixin 7 | 8 | 9 | @dataclass 10 | class HeadConfig(LoggableParamsMixin): 11 | _target_: str = MISSING 12 | 13 | def loggable_params(self) -> list[str]: 14 | return ["_target_"] 15 | 16 | 17 | @dataclass 18 | class SigmoidHeadConfig(HeadConfig): 19 | _target_: str = "cybulde.models.heads.SigmoidHead" 20 | in_features: int = MISSING 21 | out_features: int = MISSING 22 | 23 | 24 | @dataclass 25 | class BinaryClassificationSigmoidHead(SigmoidHeadConfig): 26 | in_features: int = 128 27 | out_features: int = 1 28 | 29 | 30 | def setup_config() -> None: 31 | cs = ConfigStore.instance() 32 | cs.store( 33 | name="sigmoid_head_schema", 34 | group="tasks/lightning_module/model/head", 35 | node=SigmoidHeadConfig, 36 | ) 37 | -------------------------------------------------------------------------------- /cybulde/config_schemas/trainer/logger_schemas.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING, SI 6 | 7 | 8 | @dataclass 9 | class LoggerConfig: 10 | _target_: str = MISSING 11 | 12 | 13 | @dataclass 14 | class MLFlowLoggerConfig(LoggerConfig): 15 | _target_: str = "lightning.pytorch.loggers.mlflow.MLFlowLogger" 16 | experiment_name: str = SI("${infrastructure.mlflow.experiment_name}") 17 | run_name: Optional[str] = SI("${infrastructure.mlflow.run_name}") 18 | tracking_uri: Optional[str] = SI("${infrastructure.mlflow.mlflow_internal_tracking_uri}") 19 | tags: Optional[dict[str, Any]] = None 20 | save_dir: Optional[str] = None 21 | prefix: str = "" 22 | artifact_location: Optional[str] = None 23 | run_id: Optional[str] = SI("${infrastructure.mlflow.run_id}") 24 | 25 | 26 | def setup_config() -> None: 27 | cs = ConfigStore.instance() 28 | cs.store(name="mlflow_logger_schema", group="tasks/trainer/logger", node=MLFlowLoggerConfig) 29 | -------------------------------------------------------------------------------- /cybulde/web_app/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fastapi import FastAPI 4 | from hydra.utils import instantiate 5 | 6 | from cybulde.models.common.exporter import TarModelLoader 7 | from cybulde.utils.config_utils import load_config 8 | from cybulde.utils.mlflow_utils import get_client 9 | 10 | config = load_config(config_path="../configs/automatically_generated", config_name="config") 11 | tokenizer = instantiate(config.tasks.binary_text_classification_task.data_module.transformation) 12 | 13 | model_name = "bert_tiny" 14 | model_version = "1" 15 | mlflow_client = get_client() 16 | 17 | mlflow_model = mlflow_client.get_model_version(name=model_name, version=model_version) 18 | model_path = os.path.join(mlflow_model.source, "exported_model.tar.gz") # type: ignore 19 | model = TarModelLoader(model_path).load() 20 | model.eval() 21 | 22 | app = FastAPI() 23 | 24 | 25 | @app.get("/predict_cyberbullying") 26 | def predict_cyberbullying(text: str) -> dict[str, int]: 27 | tokens = tokenizer([text]) 28 | probs = model(tokens) 29 | classes = (probs >= 0.5).item() 30 | return {"is_cyberbullying": int(classes)} 31 | -------------------------------------------------------------------------------- /cybulde/config_schemas/evaluation/evaluation_lightning_module_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.config_schemas.base_schemas import LightningModuleConfig 7 | 8 | 9 | @dataclass 10 | class EvaluationLightningModuleConfig(LightningModuleConfig): 11 | _target_: str = MISSING 12 | _partial_: bool = False 13 | 14 | def loggable_params(self) -> list[str]: 15 | return ["_target_"] 16 | 17 | 18 | @dataclass 19 | class PartialEvaluationLightningModuleConfig(EvaluationLightningModuleConfig): 20 | _partial_: bool = True 21 | 22 | 23 | @dataclass 24 | class BinaryTextEvaluationLightningModuleConfig(PartialEvaluationLightningModuleConfig): 25 | _target_: str = "cybulde.evaluation.lightning_modules.binary_text_evaluation.BinaryTextEvaluationLightningModule" 26 | 27 | 28 | def setup_config() -> None: 29 | cs = ConfigStore.instance() 30 | cs.store( 31 | 
name="binary_text_classification_prediction_lightning_module_schema", 32 | group="tasks/lightning_module", 33 | node=BinaryTextEvaluationLightningModuleConfig, 34 | ) 35 | -------------------------------------------------------------------------------- /cybulde/models/models.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Optional 3 | 4 | from torch import Tensor, nn 5 | from transformers import BatchEncoding 6 | 7 | from cybulde.models.adapters import Adapter 8 | from cybulde.models.backbones import Backbone 9 | from cybulde.models.heads import Head 10 | from cybulde.models.transformations import Transformation 11 | 12 | 13 | class Model(nn.Module): 14 | @abstractmethod 15 | def get_transformation(self) -> Transformation: 16 | ... 17 | 18 | 19 | class BinaryTextClassificationModel(Model): 20 | def __init__(self, backbone: Backbone, head: Head, adapter: Optional[Adapter]) -> None: 21 | super().__init__() 22 | self.backbone = backbone 23 | self.adapter = adapter 24 | self.head = head 25 | 26 | def forward(self, encodings: BatchEncoding) -> Tensor: 27 | output = self.backbone(encodings) 28 | if self.adapter is not None: 29 | output = self.adapter(output) 30 | output = self.head(output) 31 | assert isinstance(output, Tensor) 32 | return output 33 | 34 | def get_transformation(self) -> Transformation: 35 | return self.backbone.get_transformation() 36 | -------------------------------------------------------------------------------- /cybulde/run_tasks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from hydra.utils import instantiate 4 | from lightning.pytorch import seed_everything 5 | 6 | from cybulde.config_schemas.config_schema import Config 7 | from cybulde.utils.config_utils import get_config 8 | from cybulde.utils.torch_utils import get_local_rank 9 | from cybulde.utils.utils import get_logger 10 | 11 | 12 | @get_config( 13 | config_path="../configs/automatically_generated", config_name="config", to_object=False, return_dict_config=True 14 | ) 15 | def run_tasks(config: Config) -> None: 16 | logger = get_logger(__file__) 17 | assert config.infrastructure.mlflow.run_id is not None, "Run id has to be set for running tasks" 18 | 19 | backend = "gloo" 20 | if torch.cuda.is_available(): 21 | torch.cuda.set_device(f"cuda:{get_local_rank()}") 22 | backend = "nccl" 23 | 24 | torch.distributed.init_process_group(backend=backend) 25 | 26 | seed_everything(seed=config.seed, workers=True) 27 | 28 | for task_name, task_config in config.tasks.items(): 29 | logger.info(f"Running task: {task_name}") 30 | task = instantiate(task_config) 31 | task.run(config=config, task_config=task_config) 32 | 33 | 34 | if __name__ == "__main__": 35 | run_tasks() 36 | -------------------------------------------------------------------------------- /cybulde/config_schemas/config_schema.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | from pydantic.dataclasses import dataclass 6 | 7 | from cybulde.config_schemas import base_schemas 8 | from cybulde.config_schemas.evaluation import evaluation_task_schemas, model_selector_schemas 9 | from cybulde.config_schemas.infrastructure import infrastructure_schema 10 | from cybulde.config_schemas.training import training_task_schemas 11 | 12 | 13 | @dataclass 14 | class 
Config: 15 | infrastructure: infrastructure_schema.InfrastructureConfig = infrastructure_schema.InfrastructureConfig() 16 | save_last_checkpoint_every_n_train_steps: int = 500 17 | seed: int = 1234 18 | tasks: dict[str, base_schemas.TaskConfig] = MISSING 19 | model_selector: Optional[model_selector_schemas.ModelSelectorConfig] = None 20 | registered_model_name: Optional[str] = None 21 | docker_image: Optional[str] = None 22 | 23 | 24 | def setup_config() -> None: 25 | infrastructure_schema.setup_config() 26 | training_task_schemas.setup_config() 27 | evaluation_task_schemas.setup_config() 28 | model_selector_schemas.setup_config() 29 | 30 | cs = ConfigStore.instance() 31 | cs.store(name="config_schema", node=Config) 32 | -------------------------------------------------------------------------------- /cybulde/launch_job_on_gcp.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import mlflow 4 | 5 | from hydra.utils import instantiate 6 | 7 | from cybulde.utils.config_utils import get_config 8 | from cybulde.utils.gcp_utils import TrainingInfo 9 | 10 | if TYPE_CHECKING: 11 | from cybulde.config_schemas.config_schema import Config 12 | 13 | 14 | @get_config( 15 | config_path="../configs/automatically_generated", config_name="config", to_object=False, return_dict_config=True 16 | ) 17 | def run(config: "Config") -> None: 18 | run_id = config.infrastructure.mlflow.run_id 19 | assert run_id is not None 20 | 21 | instance_group_creator = instantiate(config.infrastructure.instance_group_creator) 22 | instance_ids = instance_group_creator.launch_instance_group() 23 | training_info = TrainingInfo( 24 | project_id=config.infrastructure.project_id, 25 | zone=config.infrastructure.zone, 26 | instance_group_name=config.infrastructure.instance_group_creator.name, 27 | instance_ids=instance_ids, 28 | mlflow_experiment_url=config.infrastructure.mlflow.experiment_url, 29 | ) 30 | mlflow.start_run(run_id=run_id, description=training_info.get_job_info_message()) 31 | training_info.print_job_info() 32 | 33 | 34 | if __name__ == "__main__": 35 | run() 36 | -------------------------------------------------------------------------------- /scripts/deploy-etcd-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcloud compute instances create-with-container etcd-server \ 4 | --project=cybulde \ 5 | --zone=europe-west4-a \ 6 | --machine-type=n1-standard-1 \ 7 | --network-interface=subnet=default,no-address \ 8 | --maintenance-policy=MIGRATE \ 9 | --provisioning-model=STANDARD \ 10 | --service-account=941446584999-compute@developer.gserviceaccount.com \ 11 | --scopes=https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring.write,https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/service.management.readonly,https://www.googleapis.com/auth/trace.append \ 12 | --image=projects/cos-cloud/global/images/cos-stable-109-17800-66-15 \ 13 | --boot-disk-size=10GB \ 14 | --boot-disk-type=pd-balanced \ 15 | --boot-disk-device-name=etcd-server \ 16 | --container-image=docker.io/bitnami/etcd:3.5 \ 17 | --container-restart-policy=always \ 18 | --container-privileged \ 19 | --container-env=ALLOW_NONE_AUTHENTICATION=yes,ETCD_ADVERTISE_CLIENT_URLS=http://0.0.0.0:2379,ETCD_ENABLE_V2=true,ETCDCTL_API=2 \ 20 | --no-shielded-secure-boot \ 21 | --shielded-vtpm \ 22 | 
--shielded-integrity-monitoring \ 23 | --labels=goog-ec-src=vm_add-gcloud,container-vm=cos-stable-109-17800-66-15 24 | -------------------------------------------------------------------------------- /cybulde/config_schemas/infrastructure/infrastructure_schema.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import SI 6 | 7 | from cybulde.config_schemas.infrastructure.instance_group_creator_schemas import InstanceGroupCreatorConfig 8 | 9 | 10 | @dataclass 11 | class MLFlowConfig: 12 | mlflow_external_tracking_uri: str = SI("${oc.env:MLFLOW_TRACKING_URI,localhost:6101}") 13 | mlflow_internal_tracking_uri: str = SI("${oc.env:MLFLOW_INTERNAL_TRACKING_URI,localhost:6101}") 14 | experiment_name: str = "Default" 15 | run_name: Optional[str] = None 16 | run_id: Optional[str] = None 17 | experiment_id: Optional[str] = None 18 | experiment_url: str = SI("${.mlflow_external_tracking_uri}/#/experiments/${.experiment_id}/runs/${.run_id}") 19 | artifact_uri: Optional[str] = None 20 | 21 | 22 | @dataclass 23 | class InfrastructureConfig: 24 | project_id: str = "cybulde" 25 | zone: str = "europe-west4-b" 26 | instance_group_creator: InstanceGroupCreatorConfig = InstanceGroupCreatorConfig() 27 | mlflow: MLFlowConfig = MLFlowConfig() 28 | etcd_ip: Optional[str] = "10.164.0.12:2379" 29 | 30 | 31 | def setup_config() -> None: 32 | cs = ConfigStore.instance() 33 | cs.store( 34 | name="infrastructure_schema", 35 | group="infrastructure", 36 | node=InfrastructureConfig, 37 | ) 38 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/transformation_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.utils.mixins import LoggableParamsMixin 7 | 8 | 9 | @dataclass 10 | class TransformationConfig(LoggableParamsMixin): 11 | _target_: str = MISSING 12 | 13 | def loggable_params(self) -> list[str]: 14 | return ["_target_"] 15 | 16 | 17 | @dataclass 18 | class HuggingFaceTokenizationTransformationConfig(TransformationConfig): 19 | _target_: str = "cybulde.models.transformations.HuggingFaceTokenizationTransformation" 20 | pretrained_tokenizer_name_or_path: str = MISSING 21 | max_sequence_length: int = MISSING 22 | 23 | def loggable_params(self) -> list[str]: 24 | return super().loggable_params() + ["pretrained_tokenizer_name_or_path", "max_sequence_length"] 25 | 26 | 27 | @dataclass 28 | class CustomHuggingFaceTokenizationTransformationConfig(HuggingFaceTokenizationTransformationConfig): 29 | pretrained_tokenizer_name_or_path: str = "gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer" 30 | max_sequence_length: int = 200 31 | 32 | 33 | def setup_config() -> None: 34 | cs = ConfigStore.instance() 35 | cs.store( 36 | name="huggingface_tokenization_transformation_schema", 37 | group="tasks/data_module/transformation", 38 | node=HuggingFaceTokenizationTransformationConfig, 39 | ) 40 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/optimizer_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from
hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class OptimizerConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | _partial_: bool = True 14 | lr: float = MISSING 15 | 16 | def loggable_params(self) -> list[str]: 17 | return ["_target_", "lr"] 18 | 19 | 20 | @dataclass 21 | class AdamOptimizerConfig(OptimizerConfig): 22 | _target_: str = "torch.optim.Adam" 23 | lr: float = 5e-3 24 | betas: tuple[float, float] = (0.9, 0.999) 25 | eps: float = 1e-8 26 | weight_decay: float = 0.0 27 | amsgrad: bool = False 28 | foreach: Optional[bool] = None 29 | maximize: bool = False 30 | capturable: bool = False 31 | 32 | 33 | @dataclass 34 | class AdamWOptimizerConfig(AdamOptimizerConfig): 35 | _target_: str = "torch.optim.AdamW" 36 | lr: float = 5e-5 37 | weight_decay: float = 1e-3 38 | 39 | 40 | def setup_config() -> None: 41 | cs = ConfigStore.instance() 42 | cs.store( 43 | name="adam_optimizer_schema", 44 | group="tasks/lightning_module/optimizer", 45 | node=AdamOptimizerConfig, 46 | ) 47 | 48 | cs.store( 49 | name="adamw_optimizer_schema", 50 | group="tasks/lightning_module/optimizer", 51 | node=AdamWOptimizerConfig, 52 | ) 53 | -------------------------------------------------------------------------------- /cybulde/models/common/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from contextlib import contextmanager 4 | from typing import Generator 5 | 6 | import torch 7 | 8 | 9 | def get_local_rank() -> int: 10 | return int(os.getenv("LOCAL_RANK", -1)) 11 | 12 | 13 | def get_global_rank() -> int: 14 | return int(os.getenv("RANK", get_local_rank())) 15 | 16 | 17 | @contextmanager 18 | def local_rank_zero_first() -> Generator[None, None, None]: 19 | if not torch.distributed.is_initialized() and os.getenv("RANK") is not None: 20 | raise RuntimeError("RANK is set but torch.distributed is not initialized") 21 | 22 | if torch.distributed.is_initialized(): 23 | rank = get_local_rank() 24 | if rank not in [-1, 0]: 25 | torch.distributed.barrier() # type: ignore 26 | yield 27 | if rank == 0: 28 | torch.distributed.barrier() # type: ignore 29 | else: 30 | yield 31 | 32 | 33 | @contextmanager 34 | def global_rank_zero_first() -> Generator[None, None, None]: 35 | if not torch.distributed.is_initialized() and os.getenv("RANK") is not None: 36 | raise RuntimeError("RANK is set but torch.distributed is not initialized") 37 | 38 | if torch.distributed.is_initialized(): 39 | rank = get_global_rank() 40 | if rank not in [-1, 0]: 41 | torch.distributed.barrier() # type: ignore 42 | yield 43 | if rank == 0: 44 | torch.distributed.barrier() # type: ignore 45 | else: 46 | yield 47 | -------------------------------------------------------------------------------- /cybulde/utils/torch_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | 4 | from typing import Any 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | 9 | from matplotlib.pyplot import figure 10 | from torch import Tensor 11 | 12 | 13 | def plot_confusion_matrix(confusion_matrix: Tensor, class_names: list[str]) -> Any: 14 | confusion_matrix = confusion_matrix.cpu().detach().numpy() 15 | 16 | figure(num=None, figsize=(16, 12), dpi=60, facecolor="w", edgecolor="k") 17 | plt.imshow(confusion_matrix, interpolation="nearest", cmap=plt.cm.Purples) # type: ignore 18 | plt.colorbar() 19 | 
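# Tick marks below carry the class names on both axes (x labels rotated 90 degrees so long
# names stay legible); the loop that follows annotates every cell with its raw count, using
# white text once a cell's value exceeds half the matrix maximum so it stays readable.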
20 | tick_marks = np.arange(len(class_names)) 21 | plt.xticks(tick_marks, class_names, rotation=90, fontsize=20) 22 | plt.yticks(tick_marks, class_names, fontsize=20) 23 | 24 | fmt = "d" 25 | thresh = confusion_matrix.max() / 2.0 26 | for i, j in itertools.product(range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1])): 27 | plt.text( 28 | j, 29 | i, 30 | format(confusion_matrix[i, j], fmt), 31 | horizontalalignment="center", 32 | color="white" if confusion_matrix[i, j] > thresh else "black", 33 | fontsize=20, 34 | ) 35 | 36 | plt.title("Confusion matrix") 37 | plt.ylabel("Actual label", fontsize=20) 38 | plt.xlabel("Predicted label", fontsize=20) 39 | plt.tight_layout() 40 | 41 | return plt.gcf() 42 | 43 | 44 | def get_local_rank() -> int: 45 | return int(os.getenv("LOCAL_RANK", -1)) 46 | -------------------------------------------------------------------------------- /cybulde/models/backbones.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoConfig, AutoModel, BatchEncoding 3 | from transformers.modeling_outputs import BaseModelOutputWithPooling 4 | 5 | from cybulde.models.transformations import Transformation 6 | from cybulde.utils.io_utils import translate_gcs_dir_to_local 7 | 8 | 9 | class Backbone(nn.Module): 10 | def __init__(self, transformation: Transformation) -> None: 11 | super().__init__() 12 | self.transformation = transformation 13 | 14 | def get_transformation(self) -> Transformation: 15 | return self.transformation 16 | 17 | 18 | class HuggingFaceBackbone(Backbone): 19 | def __init__( 20 | self, pretrained_model_name_or_path: str, transformation: Transformation, pretrained: bool = False 21 | ) -> None: 22 | super().__init__(transformation) 23 | self.backbone = self.get_backbone(pretrained_model_name_or_path, pretrained) 24 | 25 | def forward(self, encodings: BatchEncoding) -> BaseModelOutputWithPooling: 26 | output: BaseModelOutputWithPooling = self.backbone(**encodings) 27 | return output 28 | 29 | def get_backbone(self, pretrained_model_name_or_path: str, pretrained: bool) -> nn.Module: 30 | path = translate_gcs_dir_to_local(pretrained_model_name_or_path) 31 | config = AutoConfig.from_pretrained(path) 32 | if pretrained: 33 | backbone_from_pretrained: nn.Module = AutoModel.from_pretrained(path, config=config) 34 | return backbone_from_pretrained 35 | 36 | backbone_from_config: nn.Module = AutoModel.from_config(config) 37 | return backbone_from_config 38 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/backbone_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.config_schemas.models.transformation_schemas import ( 7 | CustomHuggingFaceTokenizationTransformationConfig, 8 | TransformationConfig, 9 | ) 10 | from cybulde.utils.mixins import LoggableParamsMixin 11 | 12 | 13 | @dataclass 14 | class BackboneConfig(LoggableParamsMixin): 15 | _target_: str = MISSING 16 | transformation: TransformationConfig = MISSING 17 | 18 | def loggable_params(self) -> list[str]: 19 | return ["_target_"] 20 | 21 | 22 | @dataclass 23 | class HuggingFaceBackboneConfig(BackboneConfig): 24 | _target_: str = "cybulde.models.backbones.HuggingFaceBackbone" 25 | pretrained_model_name_or_path: str = MISSING 26 | pretrained: bool = False 27 | 28 | 
def loggable_params(self) -> list[str]: 29 | return super().loggable_params() + ["pretrained_model_name_or_path", "pretrained"] 30 | 31 | 32 | @dataclass 33 | class BertTinyHuggingFaceBackboneConfig(HuggingFaceBackboneConfig): 34 | pretrained_model_name_or_path: str = "prajjwal1/bert-tiny" 35 | transformation: TransformationConfig = CustomHuggingFaceTokenizationTransformationConfig() 36 | 37 | 38 | def setup_config() -> None: 39 | cs = ConfigStore.instance() 40 | cs.store( 41 | name="hugging_face_backbone_schema", 42 | group="tasks/lightning_module/model/backbone", 43 | node=HuggingFaceBackboneConfig, 44 | ) 45 | 46 | cs.store( 47 | name="test_backbone_config", 48 | node=BertTinyHuggingFaceBackboneConfig, 49 | ) 50 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/model_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.config_schemas.models import adapter_schemas, backbone_schemas, head_schemas 8 | from cybulde.utils.mixins import LoggableParamsMixin 9 | 10 | 11 | @dataclass 12 | class ModelConfig(LoggableParamsMixin): 13 | _target_: str = MISSING 14 | 15 | def loggable_params(self) -> list[str]: 16 | return ["_target_"] 17 | 18 | 19 | @dataclass 20 | class BinaryTextClassificationModelConfig(ModelConfig): 21 | _target_: str = "cybulde.models.models.BinaryTextClassificationModel" 22 | backbone: backbone_schemas.BackboneConfig = MISSING 23 | adapter: Optional[adapter_schemas.AdapterConfig] = None 24 | head: head_schemas.HeadConfig = MISSING 25 | 26 | 27 | @dataclass 28 | class BertTinyBinaryTextClassificationModelConfig(BinaryTextClassificationModelConfig): 29 | backbone: backbone_schemas.BackboneConfig = backbone_schemas.BertTinyHuggingFaceBackboneConfig() 30 | adapter: Optional[adapter_schemas.AdapterConfig] = adapter_schemas.PoolerOutputAdapterConfig() 31 | head: head_schemas.HeadConfig = head_schemas.BinaryClassificationSigmoidHead() 32 | 33 | 34 | def setup_config() -> None: 35 | backbone_schemas.setup_config() 36 | adapter_schemas.setup_config() 37 | head_schemas.setup_config() 38 | 39 | cs = ConfigStore.instance() 40 | cs.store( 41 | name="binary_text_classification_model_schema", 42 | group="tasks/lightning_module/model", 43 | node=BinaryTextClassificationModelConfig, 44 | ) 45 | -------------------------------------------------------------------------------- /cybulde/config_schemas/evaluation/evaluation_task_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING 5 | 6 | from cybulde.config_schemas import data_module_schemas 7 | from cybulde.config_schemas.base_schemas import TaskConfig 8 | from cybulde.config_schemas.evaluation import evaluation_lightning_module_schemas 9 | from cybulde.config_schemas.trainer import trainer_schemas 10 | 11 | 12 | @dataclass 13 | class EvaluationTaskConfig(TaskConfig): 14 | pass 15 | 16 | 17 | @dataclass 18 | class TarModelEvaluationTaskConfig(EvaluationTaskConfig): 19 | tar_model_path: str = MISSING 20 | lightning_module: evaluation_lightning_module_schemas.PartialEvaluationLightningModuleConfig = MISSING 21 | 22 | 23 | @dataclass 24 | class CommonEvaluationTaskConfig(TarModelEvaluationTaskConfig): 25 | 
_target_: str = "cybulde.evaluation.tasks.common_evaluation_task.CommonEvaluationTask" 26 | 27 | 28 | @dataclass 29 | class DefaultCommonEvaluationTaskConfig(CommonEvaluationTaskConfig): 30 | name: str = "binary_text_evaluation_task" 31 | lightning_module: evaluation_lightning_module_schemas.PartialEvaluationLightningModuleConfig = ( 32 | evaluation_lightning_module_schemas.BinaryTextEvaluationLightningModuleConfig() 33 | ) 34 | 35 | 36 | def setup_config() -> None: 37 | data_module_schemas.setup_config() 38 | evaluation_lightning_module_schemas.setup_config() 39 | trainer_schemas.setup_config() 40 | 41 | cs = ConfigStore.instance() 42 | cs.store( 43 | name="common_evaluation_task_schema", 44 | group="tasks", 45 | node=CommonEvaluationTaskConfig, 46 | ) 47 | -------------------------------------------------------------------------------- /cybulde/training/tasks/bases.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from functools import partial 3 | from typing import TYPE_CHECKING, Union 4 | 5 | from lightning.pytorch import Trainer 6 | 7 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 8 | from cybulde.training.lightning_modules.bases import TrainingLightningModule 9 | from cybulde.utils.utils import get_logger 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | from cybulde.config_schemas.training.training_task_schemas import TrainingTaskConfig 14 | 15 | 16 | class TrainingTask(ABC): 17 | def __init__( 18 | self, 19 | name: str, 20 | data_module: Union[DataModule, PartialDataModuleType], 21 | lightning_module: TrainingLightningModule, 22 | trainer: Trainer, 23 | best_training_checkpoint: str, 24 | last_training_checkpoint: str, 25 | ) -> None: 26 | super().__init__() 27 | self.name = name 28 | self.trainer = trainer 29 | self.best_training_checkpoint = best_training_checkpoint 30 | self.last_training_checkpoint = last_training_checkpoint 31 | self.logger = get_logger(self.__class__.__name__) 32 | 33 | self.lightning_module = lightning_module 34 | 35 | if isinstance(data_module, partial): 36 | transformation = self.lightning_module.get_transformation() 37 | self.data_module = data_module(transformation=transformation) 38 | else: 39 | self.data_module = data_module 40 | 41 | @abstractmethod 42 | def run(self, config: "Config", task_config: "TrainingTaskConfig") -> None: 43 | ... 44 | -------------------------------------------------------------------------------- /cybulde/training/schedulers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Literal, Optional, Protocol, Union 3 | 4 | from torch.optim import Optimizer 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | 7 | 8 | class PartialSchedulerType(Protocol): 9 | def __call__( 10 | self, optimizer: Optimizer, estimated_stepping_batches: Optional[Union[int, float]] = None 11 | ) -> _LRScheduler: 12 | ... 
13 | 14 | 15 | class LightningScheduler(ABC): 16 | def __init__( 17 | self, 18 | scheduler: PartialSchedulerType, 19 | interval: Literal["epoch", "step"] = "epoch", 20 | frequency: int = 1, 21 | monitor: str = "val_loss", 22 | strict: bool = True, 23 | name: Optional[str] = None, 24 | ) -> None: 25 | self.scheduler = scheduler 26 | self.interval = interval 27 | self.frequency = frequency 28 | self.monitor = monitor 29 | self.strict = strict 30 | self.name = name 31 | 32 | @abstractmethod 33 | def configure_scheduler( 34 | self, optimizer: Optimizer, estimated_stepping_batches: Union[int, float] 35 | ) -> dict[str, Any]: 36 | ... 37 | 38 | 39 | class CommonLightningScheduler(LightningScheduler): 40 | def configure_scheduler( 41 | self, optimizer: Optimizer, estimated_stepping_batches: Union[int, float] 42 | ) -> dict[str, Any]: 43 | return { 44 | "scheduler": self.scheduler(optimizer), 45 | "interval": self.interval, 46 | "frequency": self.frequency, 47 | "monitor": self.monitor, 48 | "strict": self.strict, 49 | "name": self.name, 50 | } 51 | -------------------------------------------------------------------------------- /cybulde/models/transformations.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase 6 | 7 | from cybulde.utils.io_utils import is_dir, is_file, translate_gcs_dir_to_local 8 | 9 | 10 | class Transformation(ABC): 11 | @abstractmethod 12 | def __call__(self, texts: list[str]) -> BatchEncoding: 13 | ... 14 | 15 | 16 | class HuggingFaceTokenizationTransformation(Transformation): 17 | def __init__(self, pretrained_tokenizer_name_or_path: str, max_sequence_length: int) -> None: 18 | super().__init__() 19 | self.max_sequence_length = max_sequence_length 20 | self.tokenizer = self.get_tokenizer(pretrained_tokenizer_name_or_path) 21 | 22 | def __call__(self, texts: list[str]) -> BatchEncoding: 23 | output: BatchEncoding = self.tokenizer.batch_encode_plus( 24 | texts, truncation=True, padding=True, return_tensors="pt", max_length=self.max_sequence_length 25 | ) 26 | return output 27 | 28 | def get_tokenizer(self, pretrained_tokenizer_name_or_path: str) -> PreTrainedTokenizerBase: 29 | if is_dir(pretrained_tokenizer_name_or_path): 30 | tokenizer_dir = translate_gcs_dir_to_local(pretrained_tokenizer_name_or_path) 31 | elif is_file(pretrained_tokenizer_name_or_path): 32 | pretrained_tokenizer_name_or_path = translate_gcs_dir_to_local(pretrained_tokenizer_name_or_path) 33 | tokenizer_dir = os.path.dirname(pretrained_tokenizer_name_or_path) 34 | else: 35 | tokenizer_dir = pretrained_tokenizer_name_or_path 36 | 37 | tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(tokenizer_dir) 38 | return tokenizer 39 | -------------------------------------------------------------------------------- /cybulde/generate_final_config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import TYPE_CHECKING 3 | 4 | import mlflow 5 | 6 | from omegaconf import DictConfig 7 | 8 | from cybulde.utils.config_utils import get_config_and_dict_config, save_config_as_yaml 9 | from cybulde.utils.mlflow_utils import activate_mlflow, log_artifacts_for_reproducibility, log_training_hparams 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | 14 | 15 | @get_config_and_dict_config(config_path="../configs", 
config_name="config") # type: ignore 16 | def generate_final_config(config: "Config", dict_config: DictConfig) -> None: 17 | run: mlflow.ActiveRun 18 | with activate_mlflow( 19 | config.infrastructure.mlflow.experiment_name, 20 | run_id=config.infrastructure.mlflow.run_id, 21 | run_name=config.infrastructure.mlflow.run_name, 22 | ) as run: 23 | run_id: str = run.info.run_id 24 | experiment_id: str = run.info.experiment_id 25 | artifact_uri: str = run.info.artifact_uri 26 | 27 | dict_config.infrastructure.mlflow.artifact_uri = artifact_uri 28 | dict_config.infrastructure.mlflow.run_id = run_id 29 | dict_config.infrastructure.mlflow.experiment_id = experiment_id 30 | 31 | config_save_dir = Path("./cybulde/configs/automatically_generated/") 32 | config_save_dir.mkdir(parents=True, exist_ok=True) 33 | (config_save_dir / "__init__.py").touch(exist_ok=True) 34 | 35 | yaml_config_save_path = config_save_dir / "config.yaml" 36 | save_config_as_yaml(dict_config, str(yaml_config_save_path)) 37 | mlflow.log_artifact(str(yaml_config_save_path)) 38 | 39 | log_training_hparams(config) 40 | log_artifacts_for_reproducibility() 41 | 42 | 43 | if __name__ == "__main__": 44 | generate_final_config() 45 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/scheduler_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class SchedulerConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | _partial_: bool = True 14 | 15 | def loggable_params(self) -> list[str]: 16 | return ["_target_"] 17 | 18 | 19 | @dataclass 20 | class ReduceLROnPlateauSchedulerConfig(SchedulerConfig): 21 | _target_: str = "torch.optim.lr_scheduler.ReduceLROnPlateau" 22 | mode: str = "max" 23 | factor: float = 0.1 24 | patience: int = 10 25 | threshold: float = 1e-4 26 | threshold_mode: str = "rel" 27 | cooldown: int = 0 28 | min_lr: float = 0 29 | eps: float = 1e-8 30 | verbose: bool = False 31 | 32 | 33 | @dataclass 34 | class LightningSchedulerConfig: 35 | _target_: str = MISSING 36 | scheduler: SchedulerConfig = MISSING 37 | interval: str = "epoch" 38 | frequency: int = 1 39 | monitor: str = "validation_f1_score" 40 | strict: bool = True 41 | name: Optional[str] = None 42 | 43 | 44 | @dataclass 45 | class CommonLightningSchedulerConfig(LightningSchedulerConfig): 46 | _target_: str = "cybulde.training.schedulers.CommonLightningScheduler" 47 | 48 | 49 | @dataclass 50 | class ReduceLROnPlateauLightningSchedulerConfig(CommonLightningSchedulerConfig): 51 | scheduler: SchedulerConfig = ReduceLROnPlateauSchedulerConfig(patience=5) 52 | 53 | 54 | def setup_config() -> None: 55 | cs = ConfigStore.instance() 56 | cs.store( 57 | name="reduce_lr_on_plateau_scheduler_schema", 58 | group="tasks/lightning_module/scheduler", 59 | node=ReduceLROnPlateauLightningSchedulerConfig, 60 | ) 61 | -------------------------------------------------------------------------------- /cybulde/config_schemas/models/adapter_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.utils.mixins import 
LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class AdapterConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | 14 | def loggable_params(self) -> list[str]: 15 | return ["_target_"] 16 | 17 | 18 | @dataclass 19 | class MLPWithPoolingConfig(AdapterConfig): 20 | _target_: str = "cybulde.models.adapters.MLPWithPooling" 21 | output_feature_sizes: list[int] = MISSING 22 | biases: Optional[list[bool]] = None 23 | activation_fns: Optional[list[Optional[str]]] = None 24 | dropout_drop_probs: Optional[list[float]] = None 25 | batch_norms: Optional[list[bool]] = None 26 | order: str = "LABDN" 27 | standardize_input: bool = True 28 | pooling_method: Optional[str] = None 29 | output_attribute_to_use: Optional[str] = None 30 | 31 | def loggable_params(self) -> list[str]: 32 | return super().loggable_params() + [ 33 | "output_feature_sizes", 34 | "biases", 35 | "activation_fns", 36 | "dropout_drop_probs", 37 | "batch_norms", 38 | "order", 39 | "pooling_method", 40 | "output_attribute_to_use", 41 | ] 42 | 43 | 44 | @dataclass 45 | class PoolerOutputAdapterConfig(MLPWithPoolingConfig): 46 | output_feature_sizes: list[int] = field(default_factory=lambda: [-1]) 47 | output_attribute_to_use: str = "pooler_output" 48 | 49 | 50 | def setup_config() -> None: 51 | cs = ConfigStore.instance() 52 | cs.store( 53 | name="mlp_with_pooling_schema", 54 | group="tasks/lightning_module/model/adapter", 55 | node=MLPWithPoolingConfig, 56 | ) 57 | -------------------------------------------------------------------------------- /cybulde/config_schemas/evaluation/model_selector_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING, SI 6 | 7 | 8 | @dataclass 9 | class MetricComparerConfig: 10 | _target_: str = "cybulde.evaluation.model_selector.MetricComparer" 11 | bigger_is_better: bool = MISSING 12 | can_be_equal: bool = False 13 | metric_name: str = MISSING 14 | threshold: float = 0.0 15 | 16 | 17 | @dataclass 18 | class BinaryF1ScoreMetricComparerConfig(MetricComparerConfig): 19 | bigger_is_better: bool = True 20 | metric_name: str = "test_f1_score" 21 | 22 | 23 | @dataclass 24 | class ModelSizeMetricComparerConfig(MetricComparerConfig): 25 | bigger_is_better: bool = False 26 | metric_name: str = "model_size" 27 | can_be_equal: bool = True 28 | 29 | 30 | @dataclass 31 | class ModelSelectorConfig: 32 | _target_: str = "cybulde.evaluation.model_selector.ModelSelector" 33 | mlflow_run_id: Optional[str] = SI("${infrastructure.mlflow.run_id}") 34 | must_be_better_metric_comparers: dict[str, MetricComparerConfig] = field(default_factory=lambda: {}) 35 | to_be_thresholded_metric_comparers: dict[str, MetricComparerConfig] = field(default_factory=lambda: {}) 36 | threshold: float = 0.0 37 | 38 | 39 | @dataclass 40 | class CyberBullyingDetectionModelSelectorConfig(ModelSelectorConfig): 41 | must_be_better_metric_comparers: dict[str, MetricComparerConfig] = field( 42 | default_factory=lambda: { 43 | "f1_score": BinaryF1ScoreMetricComparerConfig(), 44 | "model_size": ModelSizeMetricComparerConfig(), 45 | } 46 | ) 47 | 48 | 49 | def setup_config() -> None: 50 | cs = ConfigStore.instance() 51 | cs.store(name="metric_comparer_schema", group="model_selector/metric_comparers", node=MetricComparerConfig) 52 | cs.store(name="model_selector_schema", group="model_selector", node=ModelSelectorConfig) 53 | 
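For intuition, a hypothetical sketch of the comparison these configs parameterize; the real logic lives in cybulde/evaluation/model_selector.py (not reproduced here) and may differ, but the parameter names below mirror the MetricComparerConfig fields:

def is_better(candidate: float, incumbent: float, bigger_is_better: bool, can_be_equal: bool = False) -> bool:
    # Flip the sign so that "greater" always means "better", whatever the metric's direction.
    if not bigger_is_better:
        candidate, incumbent = -candidate, -incumbent
    return candidate >= incumbent if can_be_equal else candidate > incumbent

# BinaryF1ScoreMetricComparerConfig -> is_better(f1_new, f1_best, bigger_is_better=True)
# ModelSizeMetricComparerConfig -> is_better(size_new, size_best, bigger_is_better=False, can_be_equal=True)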
-------------------------------------------------------------------------------- /cybulde/evaluation/tasks/bases.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import TYPE_CHECKING, Union 3 | 4 | from lightning.pytorch import Trainer 5 | 6 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 7 | from cybulde.evaluation.lightning_modules.bases import EvaluationLightningModule, PartialEvaluationLightningModuleType 8 | from cybulde.models.common.exporter import TarModelLoader 9 | 10 | if TYPE_CHECKING: 11 | from cybulde.config_schemas.config_schema import Config 12 | from cybulde.config_schemas.evaluation.evaluation_task_schemas import EvaluationTaskConfig 13 | 14 | 15 | class EvaluationTask(ABC): 16 | def __init__( 17 | self, 18 | name: str, 19 | data_module: Union[DataModule, PartialDataModuleType], 20 | lightning_module: EvaluationLightningModule, 21 | trainer: Trainer, 22 | ) -> None: 23 | super().__init__() 24 | 25 | self.name = name 26 | self.trainer = trainer 27 | self.lightning_module = lightning_module 28 | self.lightning_module.eval() 29 | 30 | if isinstance(data_module, DataModule): 31 | self.data_module = data_module 32 | else: 33 | self.data_module = data_module(transformation=self.lightning_module.get_transformation()) 34 | 35 | @abstractmethod 36 | def run(self, config: "Config", task_config: "EvaluationTaskConfig") -> None: 37 | ... 38 | 39 | 40 | class TarModelEvaluationTask(EvaluationTask): 41 | def __init__( 42 | self, 43 | name: str, 44 | data_module: Union[DataModule, PartialDataModuleType], 45 | lightning_module: PartialEvaluationLightningModuleType, 46 | trainer: Trainer, 47 | tar_model_path: str, 48 | ) -> None: 49 | model = TarModelLoader(tar_model_path).load() 50 | _lightning_module = lightning_module(model=model) 51 | super().__init__(name=name, data_module=data_module, lightning_module=_lightning_module, trainer=trainer) 52 | -------------------------------------------------------------------------------- /cybulde/config_schemas/data_module_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING, SI 5 | 6 | from cybulde.config_schemas.models import transformation_schemas 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class DataModuleConfig(LoggableParamsMixin): 12 | _target_: str = MISSING 13 | batch_size: int = MISSING 14 | shuffle: bool = False 15 | num_workers: int = 8 16 | pin_memory: bool = True 17 | drop_last: bool = True 18 | persistent_workers: bool = False 19 | 20 | def loggable_params(self) -> list[str]: 21 | return ["_target_", "batch_size"] 22 | 23 | 24 | @dataclass 25 | class TextClassificationDataModuleConfig(DataModuleConfig): 26 | _target_: str = "cybulde.data_modules.data_modules.TextClassificationDataModule" 27 | train_df_path: str = MISSING 28 | dev_df_path: str = MISSING 29 | test_df_path: str = MISSING 30 | transformation: transformation_schemas.TransformationConfig = MISSING 31 | text_column_name: str = "cleaned_text" 32 | label_column_name: str = "label" 33 | 34 | 35 | @dataclass 36 | class ScrappedDataTextClassificationDataModuleConfig(TextClassificationDataModuleConfig): 37 | batch_size: int = 64 38 | train_df_path: str = "gs://emkademy/cybulde/data/processed/rebalanced_splits/train.parquet" 39 | dev_df_path: str = 
"gs://emkademy/cybulde/data/processed/rebalanced_splits/dev.parquet" 40 | test_df_path: str = "gs://emkademy/cybulde/data/processed/rebalanced_splits/test.parquet" 41 | transformation: transformation_schemas.TransformationConfig = SI( 42 | "${..lightning_module.model.backbone.transformation}" 43 | ) 44 | 45 | 46 | def setup_config() -> None: 47 | transformation_schemas.setup_config() 48 | 49 | cs = ConfigStore.instance() 50 | cs.store( 51 | name="text_classification_data_module_schema", 52 | group="tasks/data_module", 53 | node=TextClassificationDataModuleConfig, 54 | ) 55 | -------------------------------------------------------------------------------- /cybulde/config_schemas/experiment/bert/local_bert.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import OmegaConf 6 | 7 | from cybulde.config_schemas.base_schemas import TaskConfig 8 | from cybulde.config_schemas.config_schema import Config 9 | from cybulde.config_schemas.evaluation import model_selector_schemas 10 | from cybulde.config_schemas.evaluation.evaluation_task_schemas import DefaultCommonEvaluationTaskConfig 11 | from cybulde.config_schemas.trainer.trainer_schemas import GPUProd 12 | from cybulde.config_schemas.training.training_task_schemas import DefaultCommonTrainingTaskConfig 13 | 14 | 15 | @dataclass 16 | class LocalBertExperiment(Config): 17 | tasks: dict[str, TaskConfig] = field( 18 | default_factory=lambda: { 19 | "binary_text_classification_task": DefaultCommonTrainingTaskConfig(trainer=GPUProd()), 20 | "binary_text_evaluation_task": DefaultCommonEvaluationTaskConfig(), 21 | } 22 | ) 23 | model_selector: Optional[ 24 | model_selector_schemas.ModelSelectorConfig 25 | ] = model_selector_schemas.CyberBullyingDetectionModelSelectorConfig() 26 | registered_model_name: Optional[str] = "bert_tiny" 27 | 28 | 29 | FinalLocalBertExperiment = OmegaConf.merge( 30 | LocalBertExperiment, 31 | OmegaConf.from_dotlist( 32 | [ 33 | "infrastructure.mlflow.experiment_name=cybulde", 34 | "tasks.binary_text_classification_task.data_module.batch_size=1024", 35 | "tasks.binary_text_evaluation_task.tar_model_path=${tasks.binary_text_classification_task.tar_model_export_path}", 36 | "tasks.binary_text_evaluation_task.data_module=${tasks.binary_text_classification_task.data_module}", 37 | "tasks.binary_text_evaluation_task.trainer=${tasks.binary_text_classification_task.trainer}", 38 | ] 39 | ), 40 | ) 41 | 42 | cs = ConfigStore.instance() 43 | cs.store(name="local_bert", group="experiment/bert", node=FinalLocalBertExperiment, package="_global_") 44 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-bullseye 2 | 3 | ARG USER_ID 4 | ARG USER_NAME 5 | ENV HOME=/home/${USER_NAME} \ 6 | VIRTUAL_ENV=/home/${USER_NAME}/venv 7 | ENV \ 8 | PYTHONUNBUFFERED=1 \ 9 | DEBIAN_FRONTEND=noninteractive \ 10 | TZ=Europe/Warsaw \ 11 | PATH="/usr/local/gcloud/google-cloud-sdk/bin:${HOME}/.local/bin:${VIRTUAL_ENV}/bin:${PATH}" \ 12 | PYTHONPATH="/app:${PYTHONPATH}" \ 13 | BUILD_POETRY_LOCK="${HOME}/poetry.lock.build" 14 | 15 | RUN apt-get -qq update \ 16 | && apt-get -qq -y install vim gcc curl git build-essential libb64-dev software-properties-common \ 17 | && rm -rf /var/lib/apt/lists/* \ 18 | && apt-get -qq -y clean 19 
| 20 | RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-426.0.0-linux-x86_64.tar.gz > /tmp/google-cloud-sdk.tar.gz 21 | RUN mkdir -p /usr/local/gcloud \ 22 | && tar -C /usr/local/gcloud -xf /tmp/google-cloud-sdk.tar.gz \ 23 | && /usr/local/gcloud/google-cloud-sdk/install.sh --usage-reporting false --command-completion true --bash-completion true --path-update true --quiet 24 | 25 | RUN addgroup --system --gid ${USER_ID} ${USER_NAME} \ 26 | && useradd --system -m --no-log-init --home-dir ${HOME} --uid ${USER_ID} --gid ${USER_NAME} --groups ${USER_NAME} ${USER_NAME} 27 | 28 | RUN chown -R ${USER_NAME}:${USER_NAME} ${HOME} 29 | RUN mkdir -p /app /mlflow-artifact-store "${HOME}/.local/share" && chown -R ${USER_NAME}:${USER_NAME} /app /tmp /mlflow-artifact-store "${HOME}/.local" 30 | 31 | RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.7.1 32 | 33 | USER ${USER_NAME} 34 | 35 | COPY pyproject.toml *.lock /app/ 36 | WORKDIR /app 37 | 38 | RUN poetry config virtualenvs.create false \ 39 | && python3.10 -m venv ${VIRTUAL_ENV} \ 40 | && pip install --upgrade pip setuptools \ 41 | && poetry install && cp poetry.lock ${BUILD_POETRY_LOCK} \ 42 | && rm -rf ${HOME}/.cache/* 43 | 44 | USER root 45 | COPY ./docker/scripts/* / 46 | RUN chown -R ${USER_NAME} /*.sh && chmod +x /*.sh 47 | USER ${USER_NAME} 48 | 49 | COPY . /app/ 50 | CMD ["/startup-script.sh"] 51 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | volumes: 4 | postgresql-mlflow-data: 5 | mlflow-artifact-store: 6 | 7 | x-app-template: &APP_TEMPLATE 8 | user: "${USER_ID:-1000}" 9 | hostname: "${HOST_NAME:-emkademy}" 10 | image: cybulde-model 11 | build: 12 | context: . 
13 | dockerfile: ./docker/Dockerfile 14 | args: 15 | USER_NAME: "${USER_NAME:-kyuksel}" 16 | USER_ID: "${USER_ID:-1000}" 17 | env_file: 18 | - .envs/.postgres 19 | - .envs/.mlflow-common 20 | - .envs/.mlflow-dev 21 | volumes: 22 | - ./:/app/ 23 | - mlflow-artifact-store:/mlflow-artifact-store 24 | - ~/.config/gcloud/:/home/${USER_NAME}/.config/gcloud 25 | ipc: host 26 | init: true 27 | 28 | services: 29 | mlflow-db: 30 | container_name: mlflow-backend-store 31 | image: postgres:14 32 | env_file: 33 | - .envs/.mlflow-common 34 | - .envs/.mlflow-dev 35 | - .envs/.postgres 36 | volumes: 37 | - postgresql-mlflow-data:/var/lib/postgresql/data 38 | ports: 39 | - 5433:5432 40 | profiles: 41 | - dev 42 | 43 | app-dev: 44 | <<: *APP_TEMPLATE 45 | container_name: cybulde-model-dev-container 46 | ports: 47 | - ${LOCAL_DEV_MLFLOW_SERVER_PORT}:${LOCAL_DEV_MLFLOW_SERVER_PORT} 48 | - 8080:8080 49 | - 8888:8888 50 | - 8001:8001 51 | deploy: 52 | resources: 53 | reservations: 54 | devices: 55 | - driver: nvidia 56 | count: all 57 | capabilities: [gpu] 58 | profiles: 59 | - dev 60 | 61 | app-prod: 62 | <<: *APP_TEMPLATE 63 | container_name: cybulde-model-prod-container 64 | env_file: 65 | - .envs/.mlflow-common 66 | - .envs/.mlflow-prod 67 | - .envs/.infrastructure 68 | ports: 69 | - ${PROD_MLFLOW_SERVER_PORT}:${PROD_MLFLOW_SERVER_PORT} 70 | profiles: 71 | - prod 72 | 73 | app-ci: 74 | <<: *APP_TEMPLATE 75 | container_name: cybulde-model-ci-container 76 | ports: 77 | - ${LOCAL_DEV_MLFLOW_SERVER_PORT}:${LOCAL_DEV_MLFLOW_SERVER_PORT} 78 | profiles: 79 | - ci 80 | -------------------------------------------------------------------------------- /cybulde/evaluation/tasks/common_evaluation_task.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | from hydra.utils import instantiate 4 | from lightning.pytorch import Trainer 5 | 6 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 7 | from cybulde.evaluation.lightning_modules.bases import PartialEvaluationLightningModuleType 8 | from cybulde.evaluation.tasks.bases import TarModelEvaluationTask 9 | from cybulde.utils.mlflow_utils import activate_mlflow, log_model 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | from cybulde.config_schemas.evaluation.evaluation_task_schemas import EvaluationTaskConfig 14 | 15 | 16 | class CommonEvaluationTask(TarModelEvaluationTask): 17 | def __init__( 18 | self, 19 | name: str, 20 | data_module: Union[DataModule, PartialDataModuleType], 21 | lightning_module: PartialEvaluationLightningModuleType, 22 | trainer: Trainer, 23 | tar_model_path: str, 24 | ) -> None: 25 | super().__init__( 26 | name=name, 27 | data_module=data_module, 28 | lightning_module=lightning_module, 29 | trainer=trainer, 30 | tar_model_path=tar_model_path, 31 | ) 32 | 33 | def run(self, config: "Config", task_config: "EvaluationTaskConfig") -> None: 34 | experiment_name = config.infrastructure.mlflow.experiment_name 35 | run_id = config.infrastructure.mlflow.run_id 36 | run_name = config.infrastructure.mlflow.run_name 37 | 38 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 39 | self.trainer.test(model=self.lightning_module, datamodule=self.data_module) 40 | 41 | model_selector = instantiate(config.model_selector) 42 | assert config.registered_model_name is not None 43 | if model_selector is not None: 44 | if model_selector.is_selected(): 45 | log_model( 46 | 
config.infrastructure.mlflow, model_selector.get_new_best_run_tag(), config.registered_model_name 47 | ) 48 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/training_task_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import MISSING, SI 5 | 6 | from cybulde.config_schemas import data_module_schemas 7 | from cybulde.config_schemas.base_schemas import TaskConfig 8 | from cybulde.config_schemas.trainer import trainer_schemas 9 | from cybulde.config_schemas.training import training_lightning_module_schemas 10 | 11 | 12 | @dataclass 13 | class TrainingTaskConfig(TaskConfig): 14 | best_training_checkpoint: str = SI("${infrastructure.mlflow.artifact_uri}/best-checkpoints/last.ckpt") 15 | last_training_checkpoint: str = SI("${infrastructure.mlflow.artifact_uri}/last-checkpoints/last.ckpt") 16 | 17 | 18 | @dataclass 19 | class TarModelExportingTrainingTaskConfig(TrainingTaskConfig): 20 | tar_model_export_path: str = SI("${infrastructure.mlflow.artifact_uri}/exported_model.tar.gz") 21 | 22 | 23 | @dataclass 24 | class CommonTrainingTaskConfig(TrainingTaskConfig): 25 | _target_: str = "cybulde.training.tasks.common_training_task.CommonTrainingTask" 26 | 27 | 28 | @dataclass 29 | class DefaultCommonTrainingTaskConfig(TarModelExportingTrainingTaskConfig): 30 | _target_: str = "cybulde.training.tasks.tar_model_exporting_training_task.TarModelExportingTrainingTask" 31 | name: str = "binary_text_classification_task" 32 | data_module: data_module_schemas.DataModuleConfig = ( 33 | data_module_schemas.ScrappedDataTextClassificationDataModuleConfig() 34 | ) 35 | lightning_module: training_lightning_module_schemas.TrainingLightningModuleConfig = ( 36 | training_lightning_module_schemas.CybuldeBinaryTextClassificationTrainingLightningModuleConfig() 37 | ) 38 | trainer: trainer_schemas.TrainerConfig = trainer_schemas.GPUDev() 39 | 40 | 41 | def setup_config() -> None: 42 | data_module_schemas.setup_config() 43 | training_lightning_module_schemas.setup_config() 44 | trainer_schemas.setup_config() 45 | 46 | cs = ConfigStore.instance() 47 | cs.store( 48 | name="common_training_task_schema", 49 | group="tasks", 50 | node=CommonTrainingTaskConfig, 51 | ) 52 | -------------------------------------------------------------------------------- /cybulde/config_schemas/training/training_lightning_module_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING 6 | 7 | from cybulde.config_schemas.base_schemas import LightningModuleConfig 8 | from cybulde.config_schemas.models.model_schemas import BertTinyBinaryTextClassificationModelConfig, ModelConfig 9 | from cybulde.config_schemas.training import loss_schemas, optimizer_schemas, scheduler_schemas 10 | from cybulde.utils.mixins import LoggableParamsMixin 11 | 12 | 13 | @dataclass 14 | class TrainingLightningModuleConfig(LightningModuleConfig, LoggableParamsMixin): 15 | _target_: str = MISSING 16 | model: ModelConfig = MISSING 17 | loss: loss_schemas.LossFunctionConfig = MISSING 18 | optimizer: optimizer_schemas.OptimizerConfig = MISSING 19 | scheduler: Optional[scheduler_schemas.LightningSchedulerConfig] = None 20 | 21 | def loggable_params(self) -> list[str]:
22 | return ["_target_"] 23 | 24 | 25 | @dataclass 26 | class BinaryTextClassificationTrainingLightningModuleConfig(TrainingLightningModuleConfig): 27 | _target_: str = ( 28 | "cybulde.training.lightning_modules.binary_text_classification.BinaryTextClassificationTrainingLightningModule" 29 | ) 30 | 31 | 32 | @dataclass 33 | class CybuldeBinaryTextClassificationTrainingLightningModuleConfig( 34 | BinaryTextClassificationTrainingLightningModuleConfig 35 | ): 36 | model: ModelConfig = BertTinyBinaryTextClassificationModelConfig() 37 | loss: loss_schemas.LossFunctionConfig = loss_schemas.BCEWithLogitsLossConfig() 38 | optimizer: optimizer_schemas.OptimizerConfig = optimizer_schemas.AdamWOptimizerConfig() 39 | scheduler: Optional[ 40 | scheduler_schemas.LightningSchedulerConfig 41 | ] = scheduler_schemas.ReduceLROnPlateauLightningSchedulerConfig() 42 | 43 | 44 | def setup_config() -> None: 45 | cs = ConfigStore.instance() 46 | cs.store( 47 | name="binary_text_classification_training_lightning_module_schema", 48 | group="tasks/lightning_module", 49 | node=BinaryTextClassificationTrainingLightningModuleConfig, 50 | ) 51 | -------------------------------------------------------------------------------- /cybulde/evaluation/lightning_modules/binary_text_evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import mlflow 4 | import torch 5 | 6 | from torch import Tensor 7 | from torchmetrics.classification import BinaryAccuracy, BinaryConfusionMatrix, BinaryF1Score 8 | from transformers import BatchEncoding 9 | 10 | from cybulde.evaluation.lightning_modules.bases import EvaluationLightningModule 11 | from cybulde.models.models import Model 12 | from cybulde.models.transformations import Transformation 13 | from cybulde.utils.torch_utils import plot_confusion_matrix 14 | 15 | 16 | class BinaryTextEvaluationLightningModule(EvaluationLightningModule): 17 | def __init__( 18 | self, 19 | model: Model, 20 | ) -> None: 21 | super().__init__(model=model) 22 | 23 | self.test_accuracy = BinaryAccuracy() 24 | self.test_f1_score = BinaryF1Score() 25 | self.test_confusion_matrix = BinaryConfusionMatrix() 26 | 27 | self.test_step_outputs: dict[str, list[Tensor]] = defaultdict(list) 28 | 29 | def forward(self, texts: BatchEncoding) -> Tensor: 30 | output: Tensor = self.model(texts) 31 | return output 32 | 33 | def test_step(self, batch: tuple[BatchEncoding, Tensor], batch_idx: int) -> None: # type: ignore 34 | texts, labels = batch 35 | logits = self(texts) 36 | 37 | self.test_accuracy(logits, labels) 38 | self.test_f1_score(logits, labels) 39 | self.test_confusion_matrix(logits, labels) 40 | 41 | self.log("test_accuracy", self.test_accuracy, on_step=False, on_epoch=True) 42 | self.log("test_f1_score", self.test_f1_score, on_step=False, on_epoch=True) 43 | 44 | self.test_step_outputs["logits"].append(logits) 45 | self.test_step_outputs["labels"].append(labels) 46 | 47 | def on_test_epoch_end(self) -> None: 48 | all_logits = torch.cat(self.test_step_outputs["logits"])  # cat, not stack: per-batch tensors may differ in size 49 | all_labels = torch.cat(self.test_step_outputs["labels"]) 50 | 51 | confusion_matrix = self.test_confusion_matrix(all_logits, all_labels) 52 | figure = plot_confusion_matrix(confusion_matrix, ["0", "1"]) 53 | mlflow.log_figure(figure, "test_confusion_matrix.png") 54 | 55 | self.test_step_outputs = defaultdict(list) 56 | 57 | def get_transformation(self) -> Transformation: 58 | return self.model.get_transformation() 59 |
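To see the metric-accumulation pattern above in isolation, here is a small self-contained sketch with synthetic logits and labels (no MLflow, no Lightning). torchmetrics treats float predictions outside [0, 1] as logits, applying a sigmoid and a 0.5 threshold internally, so raw model outputs can be passed straight in:

import torch
from torchmetrics.classification import BinaryConfusionMatrix, BinaryF1Score

f1 = BinaryF1Score()
confusion_matrix = BinaryConfusionMatrix()

# Two "batches" of raw logits with their binary labels; sizes differ on purpose.
batches = [
    (torch.tensor([2.0, -1.0, 0.5]), torch.tensor([1, 0, 1])),
    (torch.tensor([-0.3, 1.7]), torch.tensor([0, 1])),
]

for logits, labels in batches:
    f1.update(logits, labels)
    confusion_matrix.update(logits, labels)

print(f1.compute())                # F1 aggregated over all batches
print(confusion_matrix.compute())  # 2x2 matrix, the input to plot_confusion_matrix

Because batch sizes can differ (the last batch is usually smaller), the per-batch outputs collected in test_step_outputs are concatenated with torch.cat, which keeps the flat (num_samples,) shape the metrics expect.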
-------------------------------------------------------------------------------- /cybulde/config_schemas/infrastructure/instance_template_creator_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Optional 3 | 4 | from omegaconf import SI 5 | 6 | from cybulde.infrastructure.instance_template_creator import VMType 7 | 8 | 9 | @dataclass 10 | class BootDiskConfig: 11 | project_id: str = "deeplearning-platform-release" 12 | name: str = "common-cu113-v20230925" 13 | size_gb: int = 50 14 | labels: Any = SI("${..labels}") 15 | 16 | 17 | @dataclass 18 | class VMConfig: 19 | machine_type: str = "n1-standard-8" 20 | accelerator_count: int = 1 21 | accelerator_type: str = "nvidia-tesla-t4" 22 | vm_type: VMType = VMType.STANDARD 23 | disks: list[str] = field(default_factory=lambda: []) 24 | 25 | 26 | @dataclass 27 | class VMMetadataConfig: 28 | instance_group_name: str = SI("${infrastructure.instance_group_creator.name}") 29 | docker_image: Optional[str] = SI("${docker_image}") 30 | zone: str = SI("${infrastructure.zone}") 31 | python_hash_seed: int = 42 32 | mlflow_tracking_uri: str = SI("${infrastructure.mlflow.mlflow_internal_tracking_uri}") 33 | node_count: int = 1 34 | disks: Any = SI("${..vm_config.disks}") 35 | etcd_ip: Optional[str] = SI("${infrastructure.etcd_ip}") 36 | 37 | 38 | @dataclass 39 | class InstanceTemplateCreatorConfig: 40 | _target_: str = "cybulde.infrastructure.instance_template_creator.InstanceTemplateCreator" 41 | scopes: list[str] = field( 42 | default_factory=lambda: [ 43 | "https://www.googleapis.com/auth/cloud-platform", 44 | "https://www.googleapis.com/auth/cloud.useraccounts.readonly", 45 | "https://www.googleapis.com/auth/cloudruntimeconfig", 46 | ] 47 | ) 48 | network: str = "https://www.googleapis.com/compute/v1/projects/cybulde/global/networks/default" 49 | subnetwork: str = "https://www.googleapis.com/compute/v1/projects/cybulde/regions/europe-west4/subnetworks/default" 50 | startup_script_path: str = "scripts/vm_startup/task_runner_startup_script.sh" 51 | vm_config: VMConfig = VMConfig() 52 | boot_disk_config: BootDiskConfig = BootDiskConfig() 53 | vm_metadata_config: VMMetadataConfig = VMMetadataConfig() 54 | template_name: str = SI("${infrastructure.instance_group_creator.name}") 55 | project_id: str = SI("${infrastructure.project_id}") 56 | labels: dict[str, str] = field(default_factory=lambda: {"project": "cybulde"}) 57 | -------------------------------------------------------------------------------- /cybulde/training/tasks/common_training_task.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | from lightning.pytorch import Trainer 4 | 5 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 6 | from cybulde.training.lightning_modules.bases import TrainingLightningModule 7 | from cybulde.training.tasks.bases import TrainingTask 8 | from cybulde.utils.io_utils import is_file 9 | from cybulde.utils.mlflow_utils import activate_mlflow, log_artifacts_for_reproducibility 10 | 11 | if TYPE_CHECKING: 12 | from cybulde.config_schemas.config_schema import Config 13 | from cybulde.config_schemas.training.training_task_schemas import TrainingTaskConfig 14 | 15 | 16 | class CommonTrainingTask(TrainingTask): 17 | def __init__( 18 | self, 19 | name: str, 20 | data_module: Union[DataModule, PartialDataModuleType], 21 | lightning_module: 
TrainingLightningModule, 22 | trainer: Trainer, 23 | best_training_checkpoint: str, 24 | last_training_checkpoint: str, 25 | ) -> None: 26 | super().__init__( 27 | name=name, 28 | data_module=data_module, 29 | lightning_module=lightning_module, 30 | trainer=trainer, 31 | best_training_checkpoint=best_training_checkpoint, 32 | last_training_checkpoint=last_training_checkpoint, 33 | ) 34 | 35 | def run(self, config: "Config", task_config: "TrainingTaskConfig") -> None: 36 | experiment_name = config.infrastructure.mlflow.experiment_name 37 | run_id = config.infrastructure.mlflow.run_id 38 | run_name = config.infrastructure.mlflow.run_name 39 | 40 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 41 | if self.trainer.is_global_zero: 42 | log_artifacts_for_reproducibility() 43 | 44 | assert isinstance(self.data_module, DataModule) 45 | if is_file(self.last_training_checkpoint): 46 | self.logger.info(f"Found checkpoint here: {self.last_training_checkpoint}. Resuming training...") 47 | self.trainer.fit( 48 | model=self.lightning_module, datamodule=self.data_module, ckpt_path=self.last_training_checkpoint 49 | ) 50 | else: 51 | self.trainer.fit(model=self.lightning_module, datamodule=self.data_module) 52 | 53 | self.logger.info("Training finished...") 54 | -------------------------------------------------------------------------------- /cybulde/config_schemas/trainer/callbacks_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | from omegaconf import MISSING, SI 6 | 7 | 8 | @dataclass 9 | class CallbackConfig: 10 | _target_: str = MISSING 11 | 12 | 13 | @dataclass 14 | class ModelCheckpointConfig(CallbackConfig): 15 | _target_: str = "lightning.pytorch.callbacks.ModelCheckpoint" 16 | dirpath: Optional[str] = "./data/pytorch-lightning" 17 | filename: Optional[str] = None 18 | monitor: Optional[str] = None 19 | verbose: bool = False 20 | save_last: Optional[bool] = None 21 | save_top_k: int = 1 22 | mode: str = "min" 23 | auto_insert_metric_name: bool = False 24 | save_weights_only: bool = False 25 | every_n_train_steps: Optional[int] = None 26 | train_time_interval: Optional[str] = None 27 | every_n_epochs: Optional[int] = None 28 | save_on_train_epoch_end: Optional[bool] = None 29 | 30 | 31 | @dataclass 32 | class BestModelCheckpointConfig(ModelCheckpointConfig): 33 | dirpath: Optional[str] = SI("${infrastructure.mlflow.artifact_uri}/best-checkpoints/") 34 | monitor: str = MISSING 35 | save_last: Optional[bool] = True 36 | save_top_k: int = 2 37 | mode: str = MISSING 38 | 39 | 40 | @dataclass 41 | class ValidationF1ScoreBestModelCheckpointConfig(BestModelCheckpointConfig): 42 | monitor: str = "validation_f1_score" 43 | mode: str = "max" 44 | 45 | 46 | @dataclass 47 | class LastModelCheckpointConfig(ModelCheckpointConfig): 48 | dirpath: Optional[str] = SI("${infrastructure.mlflow.artifact_uri}/last-checkpoints/") 49 | every_n_train_steps: int = SI("${save_last_checkpoint_every_n_train_steps}") 50 | save_last: Optional[bool] = True 51 | filename: Optional[str] = "checkpoint-{epoch}" 52 | save_top_k: int = -1 53 | 54 | 55 | @dataclass 56 | class LearningRateMonitorConfig(CallbackConfig): 57 | _target_: str = "lightning.pytorch.callbacks.LearningRateMonitor" 58 | logging_interval: str = "step" 59 | 60 | 61 | def setup_config() -> None: 62 | cs = ConfigStore.instance() 63 | 64 |
cs.store(name="best_model_checkpoint_schema", group="tasks/trainer/callbacks", node=BestModelCheckpointConfig) 65 | cs.store(name="last_model_checkpoint_schema", group="tasks/trainer/callbacks", node=LastModelCheckpointConfig) 66 | cs.store(name="learning_rate_monitor_schema", group="tasks/trainer/callbacks", node=LearningRateMonitorConfig) 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | .envs/.app 113 | .envs/.dvc 114 | .envs/.mlflow 115 | .envs/.postgres 116 | .envs/.triton 117 | .envs/.secrets 118 | .envs/.gcp 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /cybulde/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from typing import Any 4 | 5 | import yaml 6 | 7 | from fsspec import AbstractFileSystem, filesystem 8 | 9 | GCS_PREFIX = "gs://" 10 | GCS_FILE_SYSTEM_NAME = "gcs" 11 | LOCAL_FILE_SYSTEM_NAME = "file" 12 | TMP_FILE_PATH = "/tmp/" 13 | 14 | 15 | def choose_file_system(path: str) -> AbstractFileSystem: 16 | return filesystem(GCS_FILE_SYSTEM_NAME) if path.startswith(GCS_PREFIX) else filesystem(LOCAL_FILE_SYSTEM_NAME) 17 | 18 | 19 | def open_file(path: str, mode: str = "r") -> Any: 20 | file_system = choose_file_system(path) 21 | return file_system.open(path, mode) 22 | 23 | 24 | def write_yaml_file(yaml_file_path: str, yaml_file_content: dict[Any, Any]) -> None: 25 | with open_file(yaml_file_path, "w") as yaml_file: 26 | yaml.dump(yaml_file_content, yaml_file) 27 | 28 | 29 | def is_dir(path: str) -> bool: 30 | file_system = choose_file_system(path) 31 | is_dir: bool = file_system.isdir(path) 32 | return is_dir 33 | 34 | 35 | def is_file(path: str) -> bool: 36 | file_system = choose_file_system(path) 37 | is_file: bool = file_system.isfile(path) 38 | return is_file 39 | 40 | 41 | def make_dirs(path: str) -> None: 42 | file_system = choose_file_system(path) 43 | file_system.makedirs(path, exist_ok=True) 44 | 45 | 46 | def list_paths(path: str) -> list[str]: 47 | file_system = choose_file_system(path) 48 | if not is_dir(path): 49 | return [] 50 | paths: list[str] = file_system.ls(path) 51 | if GCS_FILE_SYSTEM_NAME in file_system.protocol: 52 | gs_paths: list[str] = [f"{GCS_PREFIX}{path}" for path in paths] 53 | return gs_paths 54 | return paths 55 | 56 | 57 | def copy_dir(source_dir: str, target_dir: str) -> None: 58 | if not is_dir(target_dir): 59 | make_dirs(target_dir) 60 | source_files = list_paths(source_dir) 61 | for source_file in source_files: 62 | target_file = os.path.join(target_dir, os.path.basename(source_file)) 63 | if is_file(source_file): 64 | with open_file(source_file, mode="rb") as source, open_file(target_file, mode="wb") as target: 65 | content = source.read() 66 | target.write(content) 67 | else: 68 | raise ValueError(f"Source file {source_file} is not a file.") 69 | 70 | 71 | def translate_gcs_dir_to_local(path: str) -> str: 72 | if path.startswith(GCS_PREFIX): 73 | path = path.rstrip("/") 74 | local_path = os.path.join(TMP_FILE_PATH, os.path.split(path)[-1]) 75 | os.makedirs(local_path, exist_ok=True) 76 | copy_dir(path, local_path) 77 | return local_path 78 | return path 79 | -------------------------------------------------------------------------------- /scripts/vm_startup/task_runner_startup_script.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | export NCCL_ASYNC_ERROR_HANDLING=1 6 | export GCP_LOGGING_ENABLED="TRUE" 7 | 8 | INSTANCE_GROUP_NAME=$(curl --silent --fail http://metadata.google.internal/computeMetadata/v1/instance/attributes/instance_group_name -H "Metadata-Flavor: Google") 9 | DOCKER_IMAGE=$(curl --silent --fail http://metadata.google.internal/computeMetadata/v1/instance/attributes/docker_image -H "Metadata-Flavor: Google") 10 | ZONE=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/zone -H "Metadata-Flavor: Google") 11 | PYTHON_HASH_SEED=$(curl --silent --fail http://metadata.google.internal/computeMetadata/v1/instance/attributes/python_hash_seed -H "Metadata-Flavor: Google" || echo "42") 12 | MLFLOW_TRACKING_URI=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/mlflow_tracking_uri -H "Metadata-Flavor: Google") 13 | NODE_COUNT=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/node_count -H "Metadata-Flavor: Google") 14 | DISKS=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/disks -H "Metadata-Flavor: Google") 15 | ETCD_IP=$(curl --silent http://metadata.google.internal/computeMetadata/v1/instance/attributes/etcd_ip -H "Metadata-Flavor: Google") 16 | 17 | INSTANCE_GROUP_NAME=$(echo ${INSTANCE_GROUP_NAME} | tr '[:upper:]' '[:lower:]') 18 | 19 | echo -e "TRAINING: instance group name: ${INSTANCE_GROUP_NAME}, docker image: ${DOCKER_IMAGE}, node count: ${NODE_COUNT}, python hash seed: ${PYTHON_HASH_SEED}" 20 | 21 | echo "============= Installing Nvidia Drivers ===============" 22 | apt-get update && /opt/deeplearning/install-driver.sh 23 | 24 | echo "============= Downloading docker image ===============" 25 | gcloud auth configure-docker --quiet europe-west4-docker.pkg.dev 26 | time docker pull "${DOCKER_IMAGE}" 27 | 28 | echo "============= TRAINING: start ===============" 29 | 30 | if [ "${ETCD_IP}" = "None" ]; then 31 | docker run --init --rm --gpus all --ipc host --user root --hostname "$(hostname)" --privileged \ 32 | --log-driver=gcplogs \ 33 | -e PYTHONHASHSEED="${PYTHON_HASH_SEED}" \ 34 | -e MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI}" \ 35 | -e TOKENIZERS_PARALLELISM=false \ 36 | ${DOCKER_IMAGE} \ 37 | torchrun \ 38 | --nnodes="${NODE_COUNT}" \ 39 | --nproc_per_node='gpu' \ 40 | cybulde/run_tasks.py || echo '================ TRAINING: job failed ===============' 41 | else 42 | docker run --init --rm --gpus all --ipc host --user root --hostname "$(hostname)" --privileged \ 43 | --log-driver=gcplogs \ 44 | -e PYTHONHASHSEED="${PYTHON_HASH_SEED}" \ 45 | -e MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI}" \ 46 | -e TOKENIZERS_PARALLELISM=false \ 47 | ${DOCKER_IMAGE} \ 48 | torchrun \ 49 | --nnodes="${NODE_COUNT}" \ 50 | --nproc_per_node='gpu' \ 51 | --rdzv_id="${INSTANCE_GROUP_NAME}" \ 52 | --rdzv_backend=etcd-v2 \ 53 | --rdzv_endpoint="${ETCD_IP}" \ 54 | cybulde/run_tasks.py || echo '================ TRAINING: job failed ===============' 55 | 56 | fi 57 | 58 | echo "============= Cleaning up ===============" 59 | gcloud compute instance-groups managed delete --quiet "${INSTANCE_GROUP_NAME}" --zone "${ZONE}" 60 | -------------------------------------------------------------------------------- /cybulde/training/tasks/tar_model_exporting_training_task.py: -------------------------------------------------------------------------------- 
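The run() method in the listing below derives the BCEWithLogitsLoss pos_weight from the label balance of the training split before fitting. A tiny worked example of that arithmetic, with made-up counts:

import pandas as pd
from torch import Tensor

train_df = pd.DataFrame({"label": [0, 0, 0, 1]})  # 3 negatives, 1 positive (synthetic)
value_counts = train_df["label"].value_counts()   # label 0 -> 3, label 1 -> 1
pos_weight = value_counts[0] / value_counts[1]    # 3.0: each positive weighs as much as 3 negatives
print(Tensor([pos_weight]))                       # tensor([3.]) is handed to set_pos_weight()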
1 | from typing import TYPE_CHECKING, Union 2 | 3 | import pandas as pd 4 | 5 | from lightning.pytorch import Trainer 6 | from torch import Tensor 7 | 8 | from cybulde.data_modules.data_modules import DataModule, PartialDataModuleType 9 | from cybulde.models.common.exporter import TarModelExporter 10 | from cybulde.training.lightning_modules.bases import ModelStateDictExportingTrainingLightningModule 11 | from cybulde.training.tasks.bases import TrainingTask 12 | from cybulde.utils.io_utils import is_file 13 | from cybulde.utils.mlflow_utils import activate_mlflow, log_artifacts_for_reproducibility 14 | 15 | if TYPE_CHECKING: 16 | from cybulde.config_schemas.config_schema import Config 17 | from cybulde.config_schemas.training.training_task_schemas import TrainingTaskConfig 18 | 19 | 20 | class TarModelExportingTrainingTask(TrainingTask): 21 | def __init__( 22 | self, 23 | name: str, 24 | data_module: Union[DataModule, PartialDataModuleType], 25 | lightning_module: ModelStateDictExportingTrainingLightningModule, 26 | trainer: Trainer, 27 | best_training_checkpoint: str, 28 | last_training_checkpoint: str, 29 | tar_model_export_path: str, 30 | ) -> None: 31 | super().__init__( 32 | name=name, 33 | data_module=data_module, 34 | lightning_module=lightning_module, 35 | trainer=trainer, 36 | best_training_checkpoint=best_training_checkpoint, 37 | last_training_checkpoint=last_training_checkpoint, 38 | ) 39 | 40 | self.tar_model_export_path = tar_model_export_path 41 | 42 | def run(self, config: "Config", task_config: "TrainingTaskConfig") -> None: 43 | experiment_name = config.infrastructure.mlflow.experiment_name 44 | run_id = config.infrastructure.mlflow.run_id 45 | run_name = config.infrastructure.mlflow.run_name 46 | 47 | train_df = pd.read_parquet(task_config.data_module.train_df_path) 48 | value_counts = train_df["label"].value_counts() 49 | pos_weight = value_counts[0] / value_counts[1] 50 | 51 | self.lightning_module.set_pos_weight(Tensor([pos_weight])) 52 | 53 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 54 | if self.trainer.is_global_zero: 55 | log_artifacts_for_reproducibility() 56 | 57 | assert isinstance(self.data_module, DataModule) 58 | if is_file(self.last_training_checkpoint): 59 | self.logger.info(f"Found checkpoint here: {self.last_training_checkpoint}. Resuming training...") 60 | self.trainer.fit( 61 | model=self.lightning_module, datamodule=self.data_module, ckpt_path=self.last_training_checkpoint 62 | ) 63 | else: 64 | self.trainer.fit(model=self.lightning_module, datamodule=self.data_module) 65 | 66 | self.logger.info("Training finished.
Exporting model state dict...") 67 | 68 | model_state_dict_path = self.lightning_module.export_model_state_dict(self.best_training_checkpoint) # type: ignore 69 | 70 | model_config = task_config.lightning_module.model # type: ignore 71 | model_exporter = TarModelExporter(model_state_dict_path, model_config, self.tar_model_export_path) 72 | model_exporter.export() 73 | -------------------------------------------------------------------------------- /cybulde/infrastructure/instance_group_creator.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from google.cloud import compute_v1 4 | 5 | from cybulde.infrastructure.instance_template_creator import InstanceTemplateCreator 6 | from cybulde.utils.gcp_utils import wait_for_extended_operation 7 | from cybulde.utils.utils import get_logger 8 | 9 | 10 | class InstanceGroupCreator: 11 | def __init__( 12 | self, 13 | instance_template_creator: InstanceTemplateCreator, 14 | name: str, 15 | node_count: int, 16 | project_id: str, 17 | zone: str, 18 | ): 19 | self.logger = get_logger(self.__class__.__name__) 20 | self.instance_template_creator = instance_template_creator 21 | self.name = name.lower() 22 | self.node_count = node_count 23 | self.project_id = project_id 24 | self.zone = zone 25 | 26 | def launch_instance_group(self) -> list[int]: 27 | instance_group = self._create_instance_group() 28 | self.logger.debug(f"{instance_group=}") 29 | 30 | instance_ids = self._get_instance_ids(self.name, self.node_count) 31 | return instance_ids 32 | 33 | def _create_instance_group(self) -> compute_v1.InstanceGroupManager: 34 | self.logger.info("Starting to create instance group...") 35 | instance_template = self.instance_template_creator.create_template() 36 | 37 | name = self.name 38 | instance_group_manager_resource = compute_v1.InstanceGroupManager( 39 | name=name, 40 | base_instance_name=self.name, 41 | instance_template=instance_template.self_link, 42 | target_size=self.node_count, 43 | ) 44 | 45 | project_id = self.project_id 46 | zone = self.zone 47 | 48 | instance_group_managers_client = compute_v1.InstanceGroupManagersClient() 49 | operation = instance_group_managers_client.insert( 50 | project=project_id, instance_group_manager_resource=instance_group_manager_resource, zone=zone 51 | ) 52 | wait_for_extended_operation(operation, "managed instance group creation") 53 | 54 | self.logger.info("Instance group has been created...") 55 | return instance_group_managers_client.get(project=project_id, instance_group_manager=name, zone=zone) 56 | 57 | def _get_instance_ids(self, name: str, node_count: int) -> list[int]: 58 | instance_ids = set() 59 | trial = 0 60 | max_trials = 10 61 | base_sleep_time = 1.5 62 | while trial <= max_trials: 63 | self.logger.info(f"Waiting for instances ({trial=})...") 64 | pager = self.list_instances_in_group(name) 65 | for instance in pager: 66 | if instance.id: 67 | self.logger.info(f"Instance {instance.id} ready") 68 | instance_ids.add(instance.id) 69 | 70 | if len(instance_ids) >= node_count: 71 | break 72 | 73 | time.sleep(pow(base_sleep_time, trial)) 74 | trial += 1 75 | return list(instance_ids) 76 | 77 | def list_instances_in_group( 78 | self, name: str 79 | ) -> compute_v1.services.instance_group_managers.pagers.ListManagedInstancesPager: 80 | instance_group_managers_client = compute_v1.InstanceGroupManagersClient() 81 | pager = instance_group_managers_client.list_managed_instances( 82 | project=self.project_id, instance_group_manager=name, zone=self.zone 83 | ) 
84 | return pager 85 | -------------------------------------------------------------------------------- /cybulde/models/common/io_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from typing import Any 4 | 5 | import yaml 6 | 7 | from fsspec import AbstractFileSystem, filesystem 8 | 9 | GCS_PREFIX = "gs://" 10 | GCS_FILE_SYSTEM_NAME = "gcs" 11 | LOCAL_FILE_SYSTEM_NAME = "file" 12 | TMP_FILE_PATH = "/tmp/" 13 | 14 | 15 | def choose_file_system(path: str) -> AbstractFileSystem: 16 | return filesystem(GCS_FILE_SYSTEM_NAME) if path.startswith(GCS_PREFIX) else filesystem(LOCAL_FILE_SYSTEM_NAME) 17 | 18 | 19 | def open_file(path: str, mode: str = "r") -> Any: 20 | file_system = choose_file_system(path) 21 | return file_system.open(path, mode) 22 | 23 | 24 | def write_yaml_file(yaml_file_path: str, yaml_file_content: dict[Any, Any]) -> None: 25 | with open_file(yaml_file_path, "w") as yaml_file: 26 | yaml.dump(yaml_file_content, yaml_file) 27 | 28 | 29 | def is_dir(path: str) -> bool: 30 | file_system = choose_file_system(path) 31 | is_dir: bool = file_system.isdir(path) 32 | return is_dir 33 | 34 | 35 | def is_file(path: str) -> bool: 36 | file_system = choose_file_system(path) 37 | is_file: bool = file_system.isfile(path) 38 | return is_file 39 | 40 | 41 | def make_dirs(path: str) -> None: 42 | file_system = choose_file_system(path) 43 | file_system.makedirs(path, exist_ok=True) 44 | 45 | 46 | def list_paths(path: str) -> list[str]: 47 | file_system = choose_file_system(path) 48 | if not is_dir(path): 49 | return [] 50 | paths: list[str] = file_system.ls(path) 51 | if GCS_FILE_SYSTEM_NAME in file_system.protocol: 52 | gs_paths: list[str] = [f"{GCS_PREFIX}{path}" for path in paths] 53 | return gs_paths 54 | return paths 55 | 56 | 57 | def copy_file(source_file: str, target_path: str) -> None: 58 | with open_file(source_file, mode="rb") as source, open_file(target_path, "wb") as target: 59 | content = source.read() 60 | target.write(content) 61 | 62 | 63 | def copy_dir(source_dir: str, target_dir: str) -> None: 64 | if not is_dir(target_dir): 65 | make_dirs(target_dir) 66 | source_files = list_paths(source_dir) 67 | for source_file in source_files: 68 | target_file = os.path.join(target_dir, os.path.basename(source_file)) 69 | if is_file(source_file): 70 | with open_file(source_file, mode="rb") as source, open_file(target_file, mode="wb") as target: 71 | content = source.read() 72 | target.write(content) 73 | else: 74 | raise ValueError(f"Source file {source_file} is not a file.") 75 | 76 | 77 | def convert_gcs_path_to_local_path(path: str) -> str: 78 | if path.startswith(GCS_PREFIX): 79 | path = path.rstrip("/") 80 | local_path = os.path.join(TMP_FILE_PATH, os.path.split(path)[-1]) 81 | return local_path 82 | return path 83 | 84 | 85 | def cache_gcs_resource_locally(path: str) -> str: 86 | if path.startswith(GCS_PREFIX): 87 | local_path = convert_gcs_path_to_local_path(path) 88 | 89 | if os.path.exists(local_path): 90 | return local_path 91 | 92 | if is_dir(path): 93 | os.makedirs(local_path, exist_ok=True) 94 | copy_dir(path, local_path) 95 | else: 96 | copy_file(path, local_path) 97 | return local_path 98 | 99 | return path 100 | 101 | 102 | def translate_gcs_dir_to_local(path: str) -> str: 103 | if path.startswith(GCS_PREFIX): 104 | path = path.rstrip("/") 105 | local_path = os.path.join(TMP_FILE_PATH, os.path.split(path)[-1]) 106 | os.makedirs(local_path, exist_ok=True) 107 | copy_dir(path, local_path) 108 | return local_path 
109 | return path 110 | -------------------------------------------------------------------------------- /cybulde/config_schemas/trainer/trainer_schemas.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from hydra.core.config_store import ConfigStore 5 | 6 | from cybulde.config_schemas.trainer import callbacks_schemas, logger_schemas 7 | from cybulde.utils.mixins import LoggableParamsMixin 8 | 9 | 10 | @dataclass 11 | class TrainerConfig(LoggableParamsMixin): 12 | _target_: str = "lightning.pytorch.trainer.trainer.Trainer" 13 | accelerator: str = "auto" 14 | strategy: str = "ddp_find_unused_parameters_true" 15 | devices: str = "auto" 16 | num_nodes: int = 1 # SI("${}") 17 | precision: str = "16-mixed" 18 | logger: Optional[list[logger_schemas.LoggerConfig]] = field(default_factory=lambda: []) # type: ignore 19 | callbacks: Optional[list[callbacks_schemas.CallbackConfig]] = field(default_factory=lambda: []) # type: ignore 20 | fast_dev_run: bool = False 21 | max_epochs: Optional[int] = None 22 | min_epochs: Optional[int] = None 23 | max_steps: int = -1 24 | min_steps: Optional[int] = None 25 | max_time: Optional[str] = None 26 | limit_train_batches: Optional[float] = 1 27 | limit_val_batches: Optional[float] = 1 28 | limit_test_batches: Optional[float] = 1 29 | limit_predict_batches: Optional[float] = 1 30 | overfit_batches: float = 0.0 31 | val_check_interval: Optional[float] = 1 32 | check_val_every_n_epoch: Optional[int] = 1 33 | num_sanity_val_steps: int = 2 34 | log_every_n_steps: int = 50 35 | enable_checkpointing: bool = True 36 | enable_progress_bar: bool = True 37 | enable_model_summary: bool = True 38 | accumulate_grad_batches: int = 1 39 | gradient_clip_val: Optional[float] = 5 40 | gradient_clip_algorithm: Optional[str] = "value" 41 | deterministic: Optional[bool] = None 42 | benchmark: Optional[bool] = None 43 | inference_mode: bool = True 44 | use_distributed_sampler: bool = True 45 | detect_anomaly: bool = False 46 | barebones: bool = False 47 | sync_batchnorm: bool = True 48 | reload_dataloaders_every_n_epochs: int = 0 49 | default_root_dir: Optional[str] = "./data/pytorch-lightning" 50 | 51 | def loggable_params(self) -> list[str]: 52 | return ["max_epochs", "max_steps", "strategy", "precision"] 53 | 54 | 55 | @dataclass 56 | class GPUDev(TrainerConfig): 57 | max_epochs: int = 3 58 | accelerator: str = "gpu" 59 | log_every_n_steps: int = 1 60 | limit_train_batches: float = 0.01 61 | limit_val_batches: float = 0.01 62 | limit_test_batches: float = 0.01 63 | logger: Optional[list[logger_schemas.LoggerConfig]] = field( 64 | default_factory=lambda: [logger_schemas.MLFlowLoggerConfig()] 65 | ) # type: ignore 66 | callbacks: Optional[list[callbacks_schemas.CallbackConfig]] = field( 67 | default_factory=lambda: [ 68 | callbacks_schemas.ValidationF1ScoreBestModelCheckpointConfig(), 69 | callbacks_schemas.LastModelCheckpointConfig(), 70 | callbacks_schemas.LearningRateMonitorConfig(), 71 | ] 72 | ) 73 | 74 | 75 | @dataclass 76 | class GPUProd(TrainerConfig): 77 | max_epochs: int = 20 78 | accelerator: str = "gpu" 79 | log_every_n_steps: int = 20 80 | logger: Optional[list[logger_schemas.LoggerConfig]] = field( 81 | default_factory=lambda: [logger_schemas.MLFlowLoggerConfig()] 82 | ) # type: ignore 83 | callbacks: Optional[list[callbacks_schemas.CallbackConfig]] = field( 84 | default_factory=lambda: [ 85 | 
callbacks_schemas.ValidationF1ScoreBestModelCheckpointConfig(), 86 | callbacks_schemas.LastModelCheckpointConfig(), 87 | callbacks_schemas.LearningRateMonitorConfig(), 88 | ] 89 | ) 90 | 91 | 92 | def setup_config() -> None: 93 | logger_schemas.setup_config() 94 | callbacks_schemas.setup_config() 95 | 96 | cs = ConfigStore.instance() 97 | cs.store(name="trainer_schema", group="tasks/trainer", node=TrainerConfig) 98 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "emkademy" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Kıvanç Yüksel "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10,<3.11" 10 | pandas = "~=2.0" 11 | matplotlib = "~=3.7" 12 | hydra-core = "~=1.3" 13 | pydantic = "~=1.10" 14 | fsspec = { version = "~=2023.6", extras = ["gcs"] } 15 | gcsfs = "~=2023.6" 16 | torch = "==2.1.1" 17 | lightning = "==2.1.2" 18 | torchmetrics = "~=1.2" 19 | transformers = { version = "~=4.35", extras = ["torch"] } 20 | mlflow = "==2.5.0" 21 | psycopg2-binary = "~=2.9" 22 | google-cloud-compute = "~=1.13" 23 | google-cloud-secret-manager = "~=2.16" 24 | python-etcd = "~=0.4" 25 | fastapi = "~=0.104" 26 | uvicorn = {version = "~=0.24", extras=["standard"]} 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | jupyterlab = "~=4.0" 30 | pytest = "~=7.4" 31 | black = "~=23.7.0" 32 | isort = "~=5.12" 33 | flake8 = "~=6.0" 34 | mypy = "~=1.4" 35 | 36 | [build-system] 37 | requires = ["poetry-core"] 38 | build-backend = "poetry.core.masonry.api" 39 | 40 | [tool.black] 41 | line-length = 120 42 | 43 | [tool.isort] 44 | profile = "black" 45 | multi_line_output = 3 46 | include_trailing_comma = true 47 | force_grid_wrap = 0 48 | use_parentheses = true 49 | ensure_newline_before_comments = true 50 | line_length = 120 51 | indent = 4 52 | lines_between_types = 1 53 | known_first_party = ["cybulde"] 54 | default_section = "THIRDPARTY" 55 | sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 56 | 57 | [tool.mypy] 58 | python_version = "3.10" 59 | show_error_codes = true 60 | ignore_missing_imports = true 61 | install_types = true 62 | non_interactive = true 63 | follow_imports = "silent" 64 | no_strict_optional = true 65 | 66 | warn_redundant_casts = false 67 | warn_unused_ignores = true 68 | warn_unused_configs = true 69 | warn_return_any = true 70 | warn_no_return = true 71 | warn_incomplete_stub = true 72 | 73 | disallow_subclassing_any = true 74 | 75 | disallow_untyped_calls = true 76 | disallow_untyped_defs = true 77 | disallow_incomplete_defs = true 78 | disallow_untyped_decorators = true 79 | check_untyped_defs = true 80 | strict_optional = true 81 | 82 | verbosity = 0 83 | 84 | [tool.ruff] 85 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 86 | select = ["E4", "E7", "E9", "F"] 87 | ignore = [] 88 | 89 | # Allow fix for all enabled rules (when `--fix`) is provided. 
90 | fixable = [ 91 | "A", 92 | "B", 93 | "C", 94 | "D", 95 | "E", 96 | "F", 97 | "G", 98 | "I", 99 | "N", 100 | "Q", 101 | "S", 102 | "T", 103 | "W", 104 | "ANN", 105 | "ARG", 106 | "BLE", 107 | "COM", 108 | "DJ", 109 | "DTZ", 110 | "EM", 111 | "ERA", 112 | "EXE", 113 | "FBT", 114 | "ICN", 115 | "INP", 116 | "ISC", 117 | "NPY", 118 | "PD", 119 | "PGH", 120 | "PIE", 121 | "PL", 122 | "PT", 123 | "PTH", 124 | "PYI", 125 | "RET", 126 | "RSE", 127 | "RUF", 128 | "SIM", 129 | "SLF", 130 | "TCH", 131 | "TID", 132 | "TRY", 133 | "UP", 134 | "YTT", 135 | ] 136 | unfixable = [] 137 | 138 | # Exclude a variety of commonly ignored directories. 139 | exclude = [ 140 | ".bzr", 141 | ".direnv", 142 | ".eggs", 143 | ".git", 144 | ".git-rewrite", 145 | ".hg", 146 | ".mypy_cache", 147 | ".nox", 148 | ".pants.d", 149 | ".pytype", 150 | ".ruff_cache", 151 | ".svn", 152 | ".tox", 153 | ".venv", 154 | "__pypackages__", 155 | "_build", 156 | "buck-out", 157 | "build", 158 | "dist", 159 | "node_modules", 160 | "venv", 161 | ] 162 | 163 | # Same as Black. 164 | line-length = 120 165 | 166 | # Allow unused variables when underscore-prefixed. 167 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 168 | 169 | # Assume Python 3.10 170 | target-version = "py310" 171 | 172 | [tool.ruff.mccabe] 173 | # Unlike Flake8, default to a complexity level of 10. 174 | max-complexity = 10 175 | -------------------------------------------------------------------------------- /cybulde/training/lightning_modules/bases.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from abc import abstractmethod 4 | from typing import Any, Callable, Iterable, Optional, Union 5 | 6 | import mlflow 7 | import torch 8 | 9 | from lightning.pytorch import LightningModule 10 | from torch import Tensor 11 | from torch.optim import Optimizer 12 | 13 | from cybulde.models.models import Model 14 | from cybulde.models.transformations import Transformation 15 | from cybulde.training.loss_functions import LossFunction 16 | from cybulde.training.schedulers import LightningScheduler 17 | from cybulde.utils.io_utils import open_file 18 | from cybulde.utils.utils import get_logger 19 | 20 | PartialOptimizerType = Callable[[Union[Iterable[Tensor], dict[str, Iterable[Tensor]]]], Optimizer] 21 | 22 | 23 | class TrainingLightningModule(LightningModule): 24 | def __init__( 25 | self, 26 | model: Model, 27 | loss: LossFunction, 28 | optimizer: PartialOptimizerType, 29 | scheduler: Optional[LightningScheduler] = None, 30 | ) -> None: 31 | super().__init__() 32 | 33 | self.model = model 34 | self.loss = loss 35 | self.partial_optimizer = optimizer 36 | self.scheduler = scheduler 37 | 38 | self.model_size = self._calculate_model_size() 39 | self.logging_logger = get_logger(self.__class__.__name__) 40 | 41 | def _calculate_model_size(self) -> float: 42 | param_size = 0 43 | for param in self.parameters(): 44 | param_size += param.nelement() * param.element_size() 45 | 46 | buffer_size = 0 47 | for buffer in self.buffers(): 48 | buffer_size += buffer.nelement() * buffer.element_size() 49 | 50 | size_all_mb = (param_size + buffer_size) / 1024**2 51 | return size_all_mb 52 | 53 | def configure_optimizers(self) -> Union[Optimizer, tuple[list[Optimizer], list[dict[str, Any]]]]: 54 | optimizer = self.partial_optimizer(self.parameters()) 55 | 56 | if self.scheduler is not None: 57 | scheduler = self.scheduler.configure_scheduler( 58 | optimizer=optimizer,
estimated_stepping_batches=self.trainer.estimated_stepping_batches 59 | ) 60 | return [optimizer], [scheduler] 61 | 62 | return optimizer 63 | 64 | def on_train_end(self) -> None: 65 | try: 66 | mlflow.log_metric("model_size", self.model_size) 67 | except Exception: 68 | pass 69 | return super().on_train_end() 70 | 71 | @abstractmethod 72 | def training_step(self, batch: Any, batch_idx: int) -> Tensor: 73 | ... 74 | 75 | @abstractmethod 76 | def validation_step(self, batch: Any, batch_idx: int) -> Tensor: 77 | ... 78 | 79 | @abstractmethod 80 | def get_transformation(self) -> Transformation: 81 | ... 82 | 83 | 84 | class ModelStateDictExportingTrainingLightningModule(TrainingLightningModule): 85 | @abstractmethod 86 | def export_model_state_dict(self, checkpoint_path: str) -> str: 87 | """ 88 | Export model state dict from LightningModule checkpoint and save it 89 | to the same location as the checkpoint_path, and return the save path 90 | """ 91 | 92 | def common_export_model_state_dict(self, checkpoint_path: str) -> str: 93 | with open_file(checkpoint_path, "rb") as f: 94 | state_dict = torch.load(f, map_location=torch.device("cpu"))["state_dict"] 95 | 96 | model_state_dict = {} 97 | for key, value in state_dict.items(): 98 | if not key.startswith("loss."): 99 | model_state_dict[key.replace("model.", "", 1)] = value 100 | 101 | model_state_dict_save_path = os.path.join(os.path.dirname(checkpoint_path), "model_state_dict.pth") 102 | 103 | with open_file(model_state_dict_save_path, "wb") as f: 104 | torch.save(model_state_dict, f) 105 | 106 | return model_state_dict_save_path 107 | -------------------------------------------------------------------------------- /cybulde/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import sys 4 | 5 | from io import StringIO 6 | from pathlib import Path 7 | from typing import TYPE_CHECKING, Any, Optional, Union 8 | 9 | import hydra 10 | import yaml 11 | 12 | from hydra import compose, initialize 13 | from hydra.types import TaskFunction 14 | from omegaconf import DictConfig, OmegaConf 15 | 16 | from cybulde.config_schemas import config_schema 17 | from cybulde.utils.io_utils import open_file, write_yaml_file 18 | 19 | if TYPE_CHECKING: 20 | from cybulde.config_schemas.config_schema import Config 21 | 22 | 23 | def get_config( 24 | config_path: str, config_name: str, to_object: bool = True, return_dict_config: bool = False 25 | ) -> TaskFunction: 26 | setup_config() 27 | setup_logger() 28 | 29 | def main_decorator(task_function: TaskFunction) -> Any: 30 | @hydra.main(config_path=config_path, config_name=config_name, version_base=None) 31 | def decorated_main(dict_config: Optional[DictConfig] = None) -> Any: 32 | if to_object: 33 | config = OmegaConf.to_object(dict_config) 34 | 35 | if not return_dict_config: 36 | assert to_object 37 | return task_function(config) 38 | return task_function(dict_config) 39 | 40 | return decorated_main 41 | 42 | return main_decorator 43 | 44 | 45 | def get_config_and_dict_config(config_path: str, config_name: str) -> Any: 46 | setup_config() 47 | setup_logger() 48 | 49 | def main_decorator(task_function: Any) -> Any: 50 | @hydra.main(config_path=config_path, config_name=config_name, version_base=None) 51 | def decorated_main(dict_config: Optional[DictConfig] = None) -> Any: 52 | config = OmegaConf.to_object(dict_config) 53 | return task_function(config, dict_config) 54 | 55 | return decorated_main 56 | 57 | return 
main_decorator 58 | 59 | 60 | def setup_config() -> None: 61 | config_schema.setup_config() 62 | 63 | 64 | def setup_logger() -> None: 65 | with open("./cybulde/configs/hydra/job_logging/custom.yaml", "r") as stream: 66 | config = yaml.load(stream, Loader=yaml.FullLoader) 67 | logging.config.dictConfig(config) 68 | 69 | 70 | def save_config_as_yaml(config: Union["Config", DictConfig], save_path: str) -> None: 71 | text_io = StringIO() 72 | text_io.writelines( 73 | [ 74 | f"# Do not edit this file. It is automatically generated by {sys.argv[0]}.\n", 75 | "# If you want to modify configuration, edit source files in cybulde/configs directory.\n", 76 | "\n", 77 | ] 78 | ) 79 | 80 | config_header = load_config_header() 81 | text_io.write(config_header) 82 | text_io.write("\n") 83 | 84 | OmegaConf.save(config, text_io, resolve=True) 85 | with open_file(save_path, "w") as f: 86 | f.write(text_io.getvalue()) 87 | 88 | 89 | def load_config_header() -> str: 90 | config_header_path = Path("./cybulde/configs/automatically_generated/full_config_header.yaml") 91 | if not config_header_path.exists(): 92 | config_header = { 93 | "defaults": [ 94 | # {"override hydra/job_logging": "custom"}, 95 | {"override hydra/hydra_logging": "disabled"}, 96 | "_self_", 97 | ], 98 | "hydra": {"output_subdir": None, "run": {"dir": "."}}, 99 | } 100 | config_header_path.parent.mkdir(parents=True, exist_ok=True) 101 | write_yaml_file(str(config_header_path), config_header) 102 | 103 | with open(config_header_path, "r") as f: 104 | return f.read() 105 | 106 | 107 | def load_config(config_path: str, config_name: str, overrides: Optional[list[str]] = None) -> Any: 108 | setup_config() 109 | setup_logger() 110 | 111 | if overrides is None: 112 | overrides = [] 113 | 114 | with initialize(version_base=None, config_path=config_path, job_name="config-compose"): 115 | config = compose(config_name=config_name, overrides=overrides) 116 | 117 | return config 118 | -------------------------------------------------------------------------------- /cybulde/evaluation/model_selector.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | from mlflow.entities import Run 4 | 5 | from cybulde.utils.mlflow_utils import get_best_run, get_client 6 | from cybulde.utils.utils import get_logger 7 | 8 | 9 | class MetricComparer: 10 | def __init__(self, bigger_is_better: bool, can_be_equal: bool, metric_name: str, threshold: float = 0.0) -> None: 11 | self.bigger_is_better = bigger_is_better 12 | self.can_be_equal = can_be_equal 13 | self.metric_name = metric_name 14 | self.threshold = threshold 15 | 16 | def is_metric_better(self, run: Run, best_run_data: dict[str, Any]) -> bool: 17 | if not best_run_data: 18 | return True 19 | 20 | current_metric_value = self.get_current_metric_value(run) 21 | best_metric_value = best_run_data[f"metrics.{self.metric_name}"] 22 | 23 | if self.can_be_equal and current_metric_value == best_metric_value: 24 | return True 25 | 26 | if self.bigger_is_better: 27 | current_metric_value -= self.threshold 28 | result = current_metric_value > best_metric_value 29 | assert isinstance(result, bool) 30 | return result 31 | else: 32 | current_metric_value += self.threshold 33 | result = current_metric_value < best_metric_value 34 | assert isinstance(result, bool) 35 | return result 36 | 37 | def get_current_metric_value(self, run: Run) -> float: 38 | current_metric_value = run.data.metrics.get(self.metric_name, None) 39 | if current_metric_value is None: 
40 | raise RuntimeError(f"Metric: {self.metric_name} couldn't be found on MLFlow. Was it logged?") 41 | assert isinstance(current_metric_value, float) 42 | return current_metric_value 43 | 44 | 45 | class ModelSelector: 46 | def __init__( 47 | self, 48 | mlflow_run_id: str, 49 | must_be_better_metric_comparers: dict[str, MetricComparer] = {}, 50 | to_be_thresholded_metric_comparers: dict[str, MetricComparer] = {}, 51 | threshold: float = 0.0, 52 | ) -> None: 53 | if not must_be_better_metric_comparers and not to_be_thresholded_metric_comparers: 54 | raise ValueError( 55 | "Both 'must_be_better_metric_comparers' and 'to_be_thresholded_metric_comparers' cannot be empty..." 56 | ) 57 | 58 | self.logger = get_logger(self.__class__.__name__) 59 | 60 | self.mlflow_run_id = mlflow_run_id 61 | self.must_be_better_metric_comparers = must_be_better_metric_comparers 62 | self.to_be_thresholded_metric_comparers = to_be_thresholded_metric_comparers 63 | self.threshold = threshold 64 | 65 | client = get_client() 66 | self.run = client.get_run(mlflow_run_id) 67 | self.best_run_data = get_best_run() 68 | self.new_best_run_tag: Optional[str] = None 69 | 70 | def is_selected(self) -> bool: 71 | is_selected = self._is_selected(self.run) 72 | if is_selected: 73 | self.new_best_run_tag = self.get_new_best_run_tag() 74 | return is_selected 75 | 76 | def _is_selected(self, run: Run) -> bool: 77 | for metric_name, metric_comparer in self.must_be_better_metric_comparers.items(): 78 | if not metric_comparer.is_metric_better(run, self.best_run_data): 79 | self.logger.info(f"'{metric_name}' is a must have metric, and its value is not better than before...") 80 | return False 81 | 82 | hits = [] 83 | for metric_comparer in self.to_be_thresholded_metric_comparers.values(): 84 | is_metric_better = metric_comparer.is_metric_better(run, self.best_run_data) 85 | hits.append(int(is_metric_better)) 86 | 87 | if not hits: 88 | return True 89 | 90 | mean_hits = sum(hits) / len(hits) 91 | return mean_hits > self.threshold 92 | 93 | def get_new_best_run_tag(self) -> str: 94 | if len(self.best_run_data) == 0: 95 | return "v1" 96 | last_tag: str = self.best_run_data["tags.best_run"] 97 | last_version = int(last_tag[1:]) 98 | return f"v{last_version + 1}" 99 | -------------------------------------------------------------------------------- /cybulde/utils/gcp_utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import typing as t 3 | 4 | from dataclasses import dataclass 5 | 6 | from google.api_core.exceptions import GoogleAPICallError 7 | from google.api_core.extended_operation import ExtendedOperation 8 | from google.cloud import compute_v1, secretmanager 9 | 10 | from cybulde.utils.utils import get_logger 11 | 12 | GCP_UTILS_LOGGER = get_logger(__name__) 13 | 14 | 15 | def access_secret_version(project_id: str, secret_id: str, version_id: str = "1") -> str: 16 | """ 17 | Access the payload for the given secret version if one exists. The version 18 | can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 
19 | """ 20 | client = secretmanager.SecretManagerServiceClient() 21 | name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" 22 | response = client.access_secret_version(request={"name": name}) 23 | payload: str = response.payload.data.decode("UTF-8") 24 | 25 | return payload 26 | 27 | 28 | def wait_for_extended_operation( 29 | operation: ExtendedOperation, verbose_name: str = "operation", timeout: int = 300 30 | ) -> t.Any: 31 | try: 32 | result = operation.result(timeout=timeout) # type: ignore 33 | except GoogleAPICallError as ex: 34 | GCP_UTILS_LOGGER.exception("Exception occurred") 35 | for attr in ["details", "domain", "errors", "metadata", "reason", "response"]: 36 | value = getattr(ex, attr, None) 37 | if value: 38 | GCP_UTILS_LOGGER.error(f"ex.{attr}:\n{value}") 39 | if isinstance(ex.response, compute_v1.Operation): 40 | for error in ex.response.error.errors: 41 | GCP_UTILS_LOGGER.error(f"Error message: {error.message}") 42 | 43 | raise RuntimeError("Exception during extended operation") from ex 44 | 45 | if operation.error_code: 46 | GCP_UTILS_LOGGER.error( 47 | f"Error during {verbose_name}: [Code: {operation.error_code}]: {operation.error_message}" 48 | ) 49 | GCP_UTILS_LOGGER.error(f"Operation ID: {operation.name}") 50 | raise operation.exception() or RuntimeError(operation.error_message) # type: ignore 51 | 52 | if operation.warnings: 53 | GCP_UTILS_LOGGER.warning(f"Warnings during {verbose_name}:\n") 54 | for warning in operation.warnings: 55 | GCP_UTILS_LOGGER.warning(f" - {warning.code}: {warning.message}") 56 | 57 | return result 58 | 59 | 60 | @dataclass 61 | class TrainingInfo: 62 | project_id: str 63 | zone: str 64 | instance_group_name: str 65 | instance_ids: list[int] 66 | mlflow_experiment_url: str 67 | 68 | def get_job_info_message(self) -> str: 69 | instance_ids_regex, log_viewer_url, train_cluster_url = self._get_job_tracking_links() 70 | 71 | run_description = f""" 72 | Deployed training cluster: {train_cluster_url} 73 | Experiment logs (python): {log_viewer_url} 74 | if something goes wrong type in log viewer query field: 75 | ``` 76 | resource.type="gce_instance" 77 | logName="projects/{self.project_id}/logs/GCEMetadataScripts" 78 | resource.labels.instance_id={instance_ids_regex} 79 | ``` 80 | """ 81 | return inspect.cleandoc(run_description) 82 | 83 | def _get_job_tracking_links(self) -> tuple[str, str, str]: 84 | instance_ids = [str(id) for id in self.instance_ids] 85 | instance_ids_regex = " OR ".join(instance_ids) 86 | instance_ids_url = "%20OR%20".join(instance_ids) 87 | cluster_url = f"https://console.cloud.google.com/compute/instanceGroups/details/{self.zone}/{self.instance_group_name}?project={self.project_id}" 88 | log_viewer_url = f"https://console.cloud.google.com/logs/query;query=resource.type%3D%22gce_instance%22%0Aresource.labels.instance_id%3D%2528{instance_ids_url}%2529?project={self.project_id}" 89 | return instance_ids_regex, log_viewer_url, cluster_url 90 | 91 | def print_job_info(self) -> None: 92 | print(f"============ Task {self.instance_group_name} details ============") 93 | print(f"MLFlow experiment url: {self.mlflow_experiment_url}") 94 | print(self.get_job_info_message()) 95 | -------------------------------------------------------------------------------- /cybulde/utils/mlflow_utils.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | 4 | from contextlib import contextmanager 5 | from typing import TYPE_CHECKING, Any, Generator, Iterable, 
Optional 6 | 7 | import mlflow 8 | 9 | from mlflow.pyfunc import PythonModel 10 | from mlflow.tracking.fluent import ActiveRun 11 | 12 | from cybulde.config_schemas.infrastructure.infrastructure_schema import MLFlowConfig 13 | from cybulde.utils.mixins import LoggableParamsMixin 14 | 15 | MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI") 16 | 17 | if TYPE_CHECKING: 18 | from cybulde.config_schemas.config_schema import Config 19 | 20 | 21 | @contextmanager # type: ignore 22 | def activate_mlflow( 23 | experiment_name: Optional[str] = None, 24 | run_id: Optional[str] = None, 25 | run_name: Optional[str] = None, 26 | ) -> Iterable[mlflow.ActiveRun]: 27 | set_experiment(experiment_name) 28 | 29 | run: ActiveRun 30 | with mlflow.start_run(run_name=run_name, run_id=run_id) as run: 31 | yield run 32 | 33 | 34 | def set_experiment(experiment_name: Optional[str] = None) -> None: 35 | if experiment_name is None: 36 | experiment_name = "Default" 37 | 38 | try: 39 | mlflow.create_experiment(experiment_name) 40 | except mlflow.exceptions.RestException: 41 | pass 42 | 43 | mlflow.set_experiment(experiment_name) 44 | 45 | 46 | def log_artifacts_for_reproducibility() -> None: 47 | locations_to_store = ["./cybulde", "./docker", "./pyproject.toml", "./poetry.lock"] 48 | 49 | for location_to_store in locations_to_store: 50 | mlflow.log_artifact(location_to_store, "reproduction") 51 | 52 | 53 | def log_training_hparams(config: "Config") -> None: 54 | logged_nodes = set() 55 | 56 | def loggable_params(node: Any, path: list[str]) -> Generator[tuple[str, Any], None, None]: 57 | if isinstance(node, LoggableParamsMixin) and id(node) not in logged_nodes: 58 | for param_name in node.loggable_params(): 59 | yield ".".join(path + [param_name]), getattr(node, param_name) 60 | logged_nodes.add(id(node)) 61 | children = None 62 | if isinstance(node, dict): 63 | children = node.items() 64 | if dataclasses.is_dataclass(node): 65 | children = ((f.name, getattr(node, f.name)) for f in dataclasses.fields(node)) # type: ignore 66 | 67 | if children is None: 68 | return 69 | for key, val in children: 70 | for item in loggable_params(val, path + [key]): 71 | yield item 72 | 73 | params = dict(loggable_params(config, [])) 74 | mlflow.log_params(params) 75 | 76 | 77 | def get_client() -> mlflow.tracking.MlflowClient: 78 | return mlflow.tracking.MlflowClient(MLFLOW_TRACKING_URI) 79 | 80 | 81 | def get_all_experiment_ids() -> list[str]: 82 | return [exp.experiment_id for exp in mlflow.search_experiments()] 83 | 84 | 85 | def get_best_run() -> dict[str, Any]: 86 | best_runs = mlflow.search_runs(get_all_experiment_ids(), filter_string="tag.best_run LIKE 'v%'") 87 | if len(best_runs) == 0: 88 | return {} 89 | 90 | indices = best_runs["tags.best_run"].str.split("v").str[-1].astype(int).sort_values() 91 | best_runs = best_runs.reindex(index=indices.index) 92 | best_runs_dict: dict[str, Any] = best_runs.iloc[-1].to_dict() 93 | return best_runs_dict 94 | 95 | 96 | class DummyWrapper(PythonModel): # type: ignore 97 | def load_context(self, some_path: str) -> None: 98 | pass 99 | 100 | def predict(self, some_input: Any, some_other_parameter: Any) -> Optional[float]: 101 | pass 102 | 103 | 104 | def log_model(mlflow_config: MLFlowConfig, new_best_run_tag: str, registered_model_name: str) -> None: 105 | experiment_name = mlflow_config.experiment_name 106 | run_id = mlflow_config.run_id 107 | run_name = mlflow_config.run_name 108 | 109 | with activate_mlflow(experiment_name=experiment_name, run_id=run_id, run_name=run_name) as _: 110 | 
mlflow.pyfunc.log_model( 111 | artifact_path="", python_model=DummyWrapper(), registered_model_name=registered_model_name 112 | ) 113 | mlflow.set_tag("best_run", new_best_run_tag) 114 | -------------------------------------------------------------------------------- /cybulde/models/common/exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tarfile 4 | import tempfile 5 | 6 | from typing import Any 7 | 8 | import torch 9 | 10 | from hydra.utils import instantiate 11 | from omegaconf import DictConfig, OmegaConf 12 | 13 | from cybulde.models.common.io_utils import cache_gcs_resource_locally, copy_file 14 | from cybulde.models.common.utils import get_global_rank, get_local_rank, global_rank_zero_first, local_rank_zero_first 15 | from cybulde.utils.utils import get_logger 16 | 17 | MODELS_MODULE_PATH = "cybulde/models" 18 | TEMP_MODELS_MODULE_PATH = "temp_module/models" 19 | MODEL_CONFIG_PATH = "model_config.yaml" 20 | STATE_DICT_PATH = "model_state_dict.pth" 21 | EXPORTED_MODEL_FILE_NAME = "exported_model.tar.gz" 22 | 23 | 24 | class TarModelExporter: 25 | def __init__( 26 | self, 27 | model_state_dict_path: str, 28 | model_config: Any, 29 | tar_model_export_path: str, 30 | ) -> None: 31 | self.model_state_dict_path = model_state_dict_path 32 | self.model_config = model_config 33 | self.tar_model_export_path = tar_model_export_path 34 | 35 | self.logger = get_logger(self.__class__.__name__) 36 | 37 | def export(self) -> None: 38 | with global_rank_zero_first(): 39 | if get_global_rank() in [0, -1]: 40 | state_dict_path = self.download_model_state_dict() 41 | model_config_path = self.save_model_config() 42 | 43 | local_tar_path = os.path.join(tempfile.gettempdir(), EXPORTED_MODEL_FILE_NAME) 44 | with tarfile.open(local_tar_path, "w:gz") as tar: 45 | tar.add(MODELS_MODULE_PATH, arcname=TEMP_MODELS_MODULE_PATH) 46 | tar.add(state_dict_path, arcname=STATE_DICT_PATH) 47 | tar.add(model_config_path, arcname=MODEL_CONFIG_PATH) 48 | 49 | copy_file(local_tar_path, self.tar_model_export_path) 50 | 51 | self.logger.info("Model exported successfully!") 52 | 53 | def download_model_state_dict(self) -> str: 54 | return cache_gcs_resource_locally(self.model_state_dict_path) 55 | 56 | def save_model_config(self) -> str: 57 | model_config_save_path = os.path.join(tempfile.gettempdir(), MODEL_CONFIG_PATH) 58 | OmegaConf.save(self.model_config, model_config_save_path) 59 | return model_config_save_path 60 | 61 | 62 | class TarModelLoader: 63 | def __init__(self, exported_model_path: str) -> None: 64 | self.exported_model_path = exported_model_path 65 | self.replace_module_from = MODELS_MODULE_PATH.split("/")[0] 66 | self.replace_module_to = TEMP_MODELS_MODULE_PATH.split("/")[0] 67 | self.logger = get_logger(self.__class__.__name__) 68 | 69 | def load(self) -> Any: 70 | temp_export_path = "/tmp/temp_cybulde" 71 | 72 | with local_rank_zero_first(): 73 | if get_local_rank() in [0, -1]: 74 | self.extract_tar_gz(temp_export_path) 75 | 76 | model_config = self.load_model_config(temp_export_path) 77 | model = self.load_model(temp_export_path, model_config) 78 | return model 79 | 80 | def extract_tar_gz(self, export_path: str) -> None: 81 | local_exported_model_path = cache_gcs_resource_locally(self.exported_model_path) 82 | with tarfile.open(local_exported_model_path, "r:gz") as tar: 83 | tar.extractall(path=export_path) 84 | 85 | def load_model_config(self, model_dir: str) -> Any: 86 | model_config = 
OmegaConf.load(f"{model_dir}/{MODEL_CONFIG_PATH}") 87 | model_config = self._replace_module_in_model_config(model_config) 88 | return model_config 89 | 90 | def load_model(self, model_dir: str, model_config: Any) -> Any: 91 | sys.path.append(model_dir) 92 | 93 | model = instantiate(model_config) 94 | state_dict = torch.load(f"{model_dir}/{STATE_DICT_PATH}") 95 | model.load_state_dict(state_dict) 96 | 97 | sys.path.remove(model_dir) 98 | return model 99 | 100 | def _replace_module_in_model_config(self, config: Any) -> Any: 101 | for key, value in config.items(): 102 | if isinstance(value, (dict, DictConfig)): 103 | self._replace_module_in_model_config(value) 104 | 105 | if key == "_target_": 106 | assert isinstance(value, str) 107 | config[key] = value.replace(self.replace_module_from, self.replace_module_to) 108 | 109 | return config 110 | -------------------------------------------------------------------------------- /cybulde/data_modules/data_modules.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Optional, Protocol 2 | 3 | from lightning.pytorch import LightningDataModule 4 | from torch import Tensor 5 | from torch.utils.data import BatchSampler, DataLoader, Dataset, Sampler, default_collate 6 | from transformers import BatchEncoding 7 | 8 | from cybulde.data_modules.datasets import TextClassificationDataset 9 | from cybulde.models.transformations import HuggingFaceTokenizationTransformation, Transformation 10 | 11 | 12 | class DataModule(LightningDataModule): 13 | def __init__( 14 | self, 15 | batch_size: int, 16 | shuffle: bool = False, 17 | sampler: Optional[Sampler] = None, 18 | batch_sampler: Optional[BatchSampler] = None, 19 | num_workers: int = 0, 20 | collate_fn: Optional[Callable[[Any], Any]] = None, 21 | pin_memory: bool = False, 22 | drop_last: bool = False, 23 | persistent_workers: bool = False, 24 | ) -> None: 25 | super().__init__() 26 | 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.sampler = sampler 30 | self.batch_sampler = batch_sampler 31 | self.num_workers = num_workers 32 | self.collate_fn = collate_fn 33 | self.pin_memory = pin_memory 34 | self.drop_last = drop_last 35 | self.persistent_workers = persistent_workers 36 | 37 | def initialize_dataloader(self, dataset: Dataset, is_test: bool) -> DataLoader: 38 | return DataLoader( 39 | dataset, 40 | batch_size=self.batch_size, 41 | shuffle=self.shuffle and not is_test, 42 | sampler=self.sampler, 43 | batch_sampler=self.batch_sampler, 44 | num_workers=self.num_workers, 45 | collate_fn=self.collate_fn, 46 | pin_memory=self.pin_memory, 47 | drop_last=self.drop_last, 48 | persistent_workers=self.persistent_workers, 49 | ) 50 | 51 | 52 | class PartialDataModuleType(Protocol): 53 | def __call__(self, transformation: Transformation) -> DataModule: 54 | ... 
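# A hedged usage sketch of this protocol: a "partial" data module is the
# constructor with everything bound except `transformation` (Hydra's
# `_partial_: true` yields the same kind of callable from config), so a task
# can inject the transformation that the model itself exposes. The paths and
# batch size below are hypothetical placeholders, and `model` is assumed to be
# a cybulde.models.models.Model built elsewhere:
#
#     import functools
#
#     partial_data_module: PartialDataModuleType = functools.partial(
#         TextClassificationDataModule,  # defined just below
#         train_df_path="train.parquet",
#         dev_df_path="dev.parquet",
#         test_df_path="test.parquet",
#         text_column_name="cleaned_text",
#         label_column_name="label",
#         batch_size=32,
#     )
#     data_module = partial_data_module(transformation=model.get_transformation())
#     data_module.setup(stage="fit")
#     train_dataloader = data_module.train_dataloader()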
55 | 56 | 57 | class TextClassificationDataModule(DataModule): 58 | def __init__( 59 | self, 60 | train_df_path: str, 61 | dev_df_path: str, 62 | test_df_path: str, 63 | transformation: HuggingFaceTokenizationTransformation, 64 | text_column_name: str, 65 | label_column_name: str, 66 | batch_size: int, 67 | shuffle: bool = False, 68 | sampler: Optional[Sampler] = None, 69 | batch_sampler: Optional[BatchSampler] = None, 70 | num_workers: int = 0, 71 | pin_memory: bool = False, 72 | drop_last: bool = False, 73 | persistent_workers: bool = False, 74 | ) -> None: 75 | def tokenization_collate_fn(batch: list[tuple[str, int]]) -> tuple[BatchEncoding, Tensor]: 76 | texts, labels = default_collate(batch) 77 | encodings = transformation(texts) 78 | return encodings, labels 79 | 80 | super().__init__( 81 | batch_size=batch_size, 82 | shuffle=shuffle, 83 | sampler=sampler, 84 | batch_sampler=batch_sampler, 85 | num_workers=num_workers, 86 | collate_fn=tokenization_collate_fn, 87 | pin_memory=pin_memory, 88 | drop_last=drop_last, 89 | persistent_workers=persistent_workers, 90 | ) 91 | 92 | self.train_df_path = train_df_path 93 | self.dev_df_path = dev_df_path 94 | self.test_df_path = test_df_path 95 | 96 | self.text_column_name = text_column_name 97 | self.label_column_name = label_column_name 98 | 99 | def setup(self, stage: Optional[str] = None) -> None: 100 | if stage == "fit" or stage is None: 101 | self.train_dataset = TextClassificationDataset( 102 | self.train_df_path, self.text_column_name, self.label_column_name 103 | ) 104 | self.dev_dataset = TextClassificationDataset( 105 | self.dev_df_path, self.text_column_name, self.label_column_name 106 | ) 107 | 108 | if stage == "test": 109 | self.test_dataset = TextClassificationDataset( 110 | self.test_df_path, self.text_column_name, self.label_column_name 111 | ) 112 | 113 | def train_dataloader(self) -> DataLoader: 114 | return self.initialize_dataloader(self.train_dataset, is_test=False) 115 | 116 | def val_dataloader(self) -> DataLoader: 117 | return self.initialize_dataloader(self.dev_dataset, is_test=True) 118 | 119 | def test_dataloader(self) -> DataLoader: 120 | return self.initialize_dataloader(self.test_dataset, is_test=True) 121 | -------------------------------------------------------------------------------- /cybulde/training/lightning_modules/binary_text_classification.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Optional 3 | 4 | import mlflow 5 | import torch 6 | 7 | from torch import Tensor 8 | from torchmetrics.classification import BinaryAccuracy, BinaryConfusionMatrix, BinaryF1Score 9 | from transformers import BatchEncoding 10 | 11 | from cybulde.models.models import Model 12 | from cybulde.models.transformations import Transformation 13 | from cybulde.training.lightning_modules.bases import ( 14 | ModelStateDictExportingTrainingLightningModule, 15 | PartialOptimizerType, 16 | ) 17 | from cybulde.training.loss_functions import LossFunction 18 | from cybulde.training.schedulers import LightningScheduler 19 | from cybulde.utils.torch_utils import plot_confusion_matrix 20 | 21 | 22 | class BinaryTextClassificationTrainingLightningModule(ModelStateDictExportingTrainingLightningModule): 23 | def __init__( 24 | self, 25 | model: Model, 26 | loss: LossFunction, 27 | optimizer: PartialOptimizerType, 28 | scheduler: Optional[LightningScheduler] = None, 29 | ) -> None: 30 | super().__init__(model=model, loss=loss, optimizer=optimizer, 
scheduler=scheduler) 31 | 32 | self.training_accuracy = BinaryAccuracy() 33 | self.validation_accuracy = BinaryAccuracy() 34 | 35 | self.training_f1_score = BinaryF1Score() 36 | self.validation_f1_score = BinaryF1Score() 37 | 38 | self.training_confusion_matrix = BinaryConfusionMatrix() 39 | self.validation_confusion_matrix = BinaryConfusionMatrix() 40 | 41 | self.train_step_outputs: dict[str, list[Tensor]] = defaultdict(list) 42 | self.validation_step_outputs: dict[str, list[Tensor]] = defaultdict(list) 43 | 44 | self.pos_weight: Optional[Tensor] = None 45 | 46 | def set_pos_weight(self, pos_weight: Tensor) -> None: 47 | self.pos_weight = pos_weight 48 | 49 | def forward(self, texts: BatchEncoding) -> Tensor: 50 | output: Tensor = self.model(texts) 51 | return output 52 | 53 | def training_step(self, batch: tuple[BatchEncoding, Tensor], batch_idx: int) -> Tensor: 54 | texts, labels = batch 55 | logits = self(texts) 56 | 57 | self.pos_weight = self.pos_weight.to(self.device) 58 | loss = self.loss(logits, labels, pos_weight=self.pos_weight) 59 | self.log("loss", loss, sync_dist=True) 60 | 61 | self.training_accuracy(logits, labels) 62 | self.training_f1_score(logits, labels) 63 | self.training_confusion_matrix(logits, labels) 64 | 65 | self.log("training_accuracy", self.training_accuracy, on_step=False, on_epoch=True) 66 | self.log("training_f1_score", self.training_f1_score, on_step=False, on_epoch=True) 67 | 68 | self.train_step_outputs["logits"].append(logits) 69 | self.train_step_outputs["labels"].append(labels) 70 | 71 | assert isinstance(loss, Tensor) 72 | return loss 73 | 74 | def on_train_epoch_end(self) -> None: 75 | all_logits = torch.stack(self.train_step_outputs["logits"]) 76 | all_labels = torch.stack(self.train_step_outputs["labels"]) 77 | 78 | confusion_matrix = self.training_confusion_matrix(all_logits, all_labels) 79 | figure = plot_confusion_matrix(confusion_matrix, ["0", "1"]) 80 | mlflow.log_figure(figure, "training_confusion_matrix.png") 81 | 82 | self.train_step_outputs = defaultdict(list) 83 | 84 | def validation_step(self, batch: tuple[BatchEncoding, Tensor], batch_idx: int) -> dict[str, Tensor]: # type: ignore 85 | texts, labels = batch 86 | logits = self(texts) 87 | 88 | loss = self.loss(logits, labels) 89 | self.log("validation_loss", loss, sync_dist=True) 90 | 91 | self.validation_accuracy(logits, labels) 92 | self.validation_f1_score(logits, labels) 93 | 94 | self.log("validation_accuracy", self.validation_accuracy, on_step=False, on_epoch=True) 95 | self.log("validation_f1_score", self.validation_f1_score, on_step=False, on_epoch=True) 96 | 97 | self.validation_step_outputs["logits"].append(logits) 98 | self.validation_step_outputs["labels"].append(labels) 99 | 100 | return {"loss": loss, "predictions": logits, "labels": labels} 101 | 102 | def on_validation_epoch_end(self) -> None: 103 | all_logits = torch.stack(self.validation_step_outputs["logits"]) 104 | all_labels = torch.stack(self.validation_step_outputs["labels"]) 105 | 106 | confusion_matrix = self.validation_confusion_matrix(all_logits, all_labels) 107 | figure = plot_confusion_matrix(confusion_matrix, ["0", "1"]) 108 | mlflow.log_figure(figure, "validation_confusion_matrix.png") 109 | 110 | self.validation_step_outputs = defaultdict(list) 111 | 112 | def get_transformation(self) -> Transformation: 113 | return self.model.get_transformation() 114 | 115 | def export_model_state_dict(self, checkpoint_path: str) -> str: 116 | return self.common_export_model_state_dict(checkpoint_path) 117 | 
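# A hedged sketch of wiring this module up by hand, mirroring what the Hydra
# config instantiates. Two contracts are easy to miss: `optimizer` must be a
# PartialOptimizerType (a callable that still expects the parameters), and
# `set_pos_weight()` must be called before training starts, because
# training_step() unconditionally moves `self.pos_weight` to the device.
# `model` is assumed to be built elsewhere, and the weight value is a
# hypothetical class-imbalance ratio:
#
#     import functools
#
#     import torch
#
#     from cybulde.training.loss_functions import BCEWithLogitsLoss
#
#     module = BinaryTextClassificationTrainingLightningModule(
#         model=model,
#         loss=BCEWithLogitsLoss(reduction="mean"),
#         optimizer=functools.partial(torch.optim.AdamW, lr=5e-5, weight_decay=1e-3),
#     )
#     module.set_pos_weight(torch.tensor([2.0]))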
-------------------------------------------------------------------------------- /cybulde/models/adapters.py: -------------------------------------------------------------------------------- 1 | from operator import attrgetter 2 | from typing import Literal, Optional 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from torch import Tensor, nn 8 | from transformers.modeling_outputs import BaseModelOutputWithPooling 9 | 10 | 11 | class Adapter(nn.Module): 12 | pass 13 | 14 | 15 | class Normalization(nn.Module): 16 | def __init__(self, p: float = 2.0) -> None: 17 | super().__init__() 18 | self.p = p 19 | 20 | def forward(self, x: Tensor) -> Tensor: 21 | return F.normalize(x, p=self.p, dim=1) 22 | 23 | 24 | class FCLayer(Adapter): 25 | def __init__( 26 | self, 27 | in_features: int, 28 | out_features: int, 29 | bias: bool, 30 | activation_fn: Optional[nn.Module] = None, 31 | dropout: float = 0.0, 32 | batch_norm: bool = False, 33 | order: str = "LABDN", 34 | ) -> None: 35 | super().__init__() 36 | 37 | order = order.upper() 38 | 39 | layers: dict[str, tuple[str, nn.Module]] = {"L": ("linear", nn.Linear(in_features, out_features, bias=bias))} 40 | 41 | if activation_fn is not None: 42 | layers["A"] = ("activation_fn", activation_fn) 43 | 44 | if batch_norm: 45 | layers["B"] = ( 46 | "batch_norm", 47 | nn.BatchNorm1d(out_features if order.index("L") < order.index("B") else in_features), 48 | ) 49 | 50 | if dropout > 0.0: 51 | layers["D"] = ("dropout", nn.Dropout(dropout)) 52 | 53 | if "N" in order: 54 | layers["N"] = ("normalization", Normalization()) 55 | 56 | self.layers = nn.Sequential() 57 | for layer_code in order: 58 | if layer_code in layers: 59 | name, layer = layers[layer_code] 60 | self.layers.add_module(name, layer) 61 | 62 | def forward(self, x: Tensor) -> Tensor: 63 | output: Tensor = self.layers(x) 64 | return output 65 | 66 | 67 | class MLPLayer(Adapter): 68 | def __init__( 69 | self, 70 | output_feature_sizes: list[int], 71 | biases: Optional[list[bool]] = None, 72 | activation_fns: Optional[list[Optional[str]]] = None, 73 | dropout_drop_probs: Optional[list[float]] = None, 74 | batch_norms: Optional[list[bool]] = None, 75 | order: str = "LABDN", 76 | standardize_input: bool = True, 77 | ) -> None: 78 | super().__init__() 79 | 80 | self.output_feature_sizes = output_feature_sizes 81 | self.output_embedding_size = output_feature_sizes[-1] 82 | 83 | nrof_layers = len(self.output_feature_sizes) - 1 84 | biases = [False] * nrof_layers if biases is None else biases 85 | activation_functions: list[Optional[str]] = [None] * nrof_layers if activation_fns is None else activation_fns 86 | dropout_drop_probabilities = [0.0] * nrof_layers if dropout_drop_probs is None else dropout_drop_probs 87 | batch_normalizations = [False] * nrof_layers if batch_norms is None else batch_norms 88 | 89 | assert ( 90 | nrof_layers 91 | == len(activation_functions) 92 | == len(dropout_drop_probabilities) 93 | == len(batch_normalizations) 94 | == len(biases) 95 | ) 96 | 97 | self.adapter = nn.Sequential() 98 | 99 | if standardize_input: 100 | self.adapter.add_module( 101 | "standardize_input", nn.LayerNorm(output_feature_sizes[0], elementwise_affine=False) 102 | ) 103 | 104 | for i in range(nrof_layers): 105 | activation_function = activation_functions[i] 106 | self.adapter.add_module( 107 | f"fc_layer_{i}", 108 | FCLayer( 109 | in_features=output_feature_sizes[i], 110 | out_features=output_feature_sizes[i + 1], 111 | bias=biases[i], 112 | activation_fn=getattr(nn, activation_function)() if 
activation_function is not None else None, 113 | dropout=dropout_drop_probabilities[i], 114 | batch_norm=batch_normalizations[i], 115 | order=order, 116 | ), 117 | ) 118 | 119 | def forward(self, backbone_output: Tensor) -> Tensor: 120 | output: Tensor = self.adapter(backbone_output) 121 | return output 122 | 123 | 124 | class MLPWithPooling(Adapter): 125 | def __init__( 126 | self, 127 | output_feature_sizes: list[int], 128 | biases: Optional[list[bool]] = None, 129 | activation_fns: Optional[list[Optional[str]]] = None, 130 | dropout_drop_probs: Optional[list[float]] = None, 131 | batch_norms: Optional[list[bool]] = None, 132 | order: str = "LABDN", 133 | standardize_input: bool = True, 134 | pooling_method: Optional[str] = None, 135 | output_attribute_to_use: Optional[Literal["pooler_output", "last_hidden_state"]] = None, 136 | ) -> None: 137 | super().__init__() 138 | 139 | self.output_feature_sizes = output_feature_sizes 140 | self.output_embedding_size = output_feature_sizes[-1] 141 | 142 | nrof_layers = len(output_feature_sizes) - 1 143 | if nrof_layers > 0: 144 | self.projection = MLPLayer( 145 | output_feature_sizes=output_feature_sizes, 146 | biases=biases, 147 | activation_fns=activation_fns, 148 | dropout_drop_probs=dropout_drop_probs, 149 | batch_norms=batch_norms, 150 | order=order, 151 | standardize_input=standardize_input, 152 | ) 153 | else: 154 | self.projection = nn.Identity() # type: ignore 155 | 156 | if pooling_method == "mean_pooler": 157 | self.pooler = mean_pool_tokens 158 | elif pooling_method == "cls_pooler": 159 | self.pooler = cls_pool_tokens 160 | else: 161 | self.pooler = nn.Identity() 162 | 163 | if output_attribute_to_use is not None: 164 | self.get_output_tensor = attrgetter(output_attribute_to_use) 165 | else: 166 | self.get_output_tensor = nn.Identity() # type: ignore 167 | 168 | def forward(self, backbone_output: BaseModelOutputWithPooling) -> Tensor: 169 | output = self.get_output_tensor(backbone_output) 170 | output = self.pooler(output) 171 | output = self.projection(output) 172 | assert isinstance(output, Tensor) 173 | return output 174 | 175 | 176 | def mean_pool_tokens(tensor: Tensor) -> Tensor: 177 | dims = len(tensor.shape) 178 | if dims != 3: 179 | raise ValueError(f"Tokens pooling expects exactly 3 dimensional tensor, got: {dims}") 180 | return torch.mean(tensor, dim=1) 181 | 182 | 183 | def cls_pool_tokens(tensor: Tensor) -> Tensor: 184 | dims = len(tensor.shape) 185 | if dims != 3: 186 | raise ValueError(f"Tokens pooling expects exactly 3 dimensional tensor, got: {dims}") 187 | return tensor[:, 0, :] 188 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Make all targets .PHONY 2 | .PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST)) 3 | 4 | include .envs/.postgres 5 | include .envs/.mlflow-common 6 | include .envs/.mlflow-dev 7 | include .envs/.infrastructure 8 | export 9 | 10 | SHELL = /usr/bin/env bash 11 | USER_NAME = $(shell whoami) 12 | USER_ID = $(shell id -u) 13 | HOST_NAME = $(shell hostname) 14 | 15 | ifeq (, $(shell which docker-compose)) 16 | DOCKER_COMPOSE_COMMAND = docker compose 17 | else 18 | DOCKER_COMPOSE_COMMAND = docker-compose 19 | endif 20 | 21 | PROD_SERVICE_NAME = app-prod 22 | PROD_CONTAINER_NAME = cybulde-model-prod-container 23 | PROD_PROFILE_NAME = prod 24 | 25 | ifeq (, $(shell which nvidia-smi)) 26 | PROFILE = ci 27 | CONTAINER_NAME = 
cybulde-model-ci-container
28 | SERVICE_NAME = app-ci
29 | else
30 | PROFILE = dev
31 | CONTAINER_NAME = cybulde-model-dev-container
32 | SERVICE_NAME = app-dev
33 | endif
34 | 
35 | DIRS_TO_VALIDATE = cybulde
36 | DOCKER_COMPOSE_RUN = $(DOCKER_COMPOSE_COMMAND) run --rm $(SERVICE_NAME)
37 | DOCKER_COMPOSE_EXEC = $(DOCKER_COMPOSE_COMMAND) exec $(SERVICE_NAME)
38 | 
39 | DOCKER_COMPOSE_RUN_PROD = $(DOCKER_COMPOSE_COMMAND) run --rm $(PROD_SERVICE_NAME)
40 | DOCKER_COMPOSE_EXEC_PROD = $(DOCKER_COMPOSE_COMMAND) exec $(PROD_SERVICE_NAME)
41 | 
42 | IMAGE_TAG := $(shell echo "train-$$(uuidgen)")
43 | 
44 | # Returns true if the stem is a non-empty environment variable, or else raises an error.
45 | guard-%:
46 | @#$(or ${$*}, $(error $* is not set))
47 | 
48 | ## Generate final config. For overrides use: OVERRIDES=
49 | generate-final-config: up-prod
50 | @$(DOCKER_COMPOSE_EXEC_PROD) python cybulde/generate_final_config.py docker_image=${GCP_DOCKER_REGISTRY_URL}:${IMAGE_TAG} ${OVERRIDES}
51 | 
52 | ## Generate final config local. For overrides use: OVERRIDES=
53 | local-generate-final-config: up
54 | @$(DOCKER_COMPOSE_EXEC) python cybulde/generate_final_config.py ${OVERRIDES}
55 | 
56 | ## Run tasks on GCP
57 | run-tasks: generate-final-config push
58 | $(DOCKER_COMPOSE_EXEC_PROD) python cybulde/launch_job_on_gcp.py
59 | 
60 | ## Local run tasks
61 | local-run-tasks: local-generate-final-config
62 | $(DOCKER_COMPOSE_EXEC) torchrun cybulde/run_tasks.py
63 | 
64 | ## Starts jupyter lab
65 | notebook: up
66 | $(DOCKER_COMPOSE_EXEC) jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser
67 | 
68 | ## Sort code using isort
69 | sort: up
70 | $(DOCKER_COMPOSE_EXEC) isort --atomic $(DIRS_TO_VALIDATE)
71 | 
72 | ## Check sorting using isort
73 | sort-check: up
74 | $(DOCKER_COMPOSE_EXEC) isort --check-only --atomic $(DIRS_TO_VALIDATE)
75 | 
76 | ## Format code using black
77 | format: up
78 | $(DOCKER_COMPOSE_EXEC) black $(DIRS_TO_VALIDATE)
79 | 
80 | ## Check format using black
81 | format-check: up
82 | $(DOCKER_COMPOSE_EXEC) black --check $(DIRS_TO_VALIDATE)
83 | 
84 | ## Format and sort code using black and isort
85 | format-and-sort: sort format
86 | 
87 | ## Lint code using flake8
88 | lint: up format-check sort-check
89 | $(DOCKER_COMPOSE_EXEC) flake8 $(DIRS_TO_VALIDATE)
90 | 
91 | ## Check type annotations using mypy
92 | check-type-annotations: up
93 | $(DOCKER_COMPOSE_EXEC) mypy $(DIRS_TO_VALIDATE)
94 | 
95 | ## Run tests with pytest
96 | test: up
97 | $(DOCKER_COMPOSE_EXEC) pytest
98 | 
99 | ## Perform a full check
100 | full-check: lint check-type-annotations
101 | $(DOCKER_COMPOSE_EXEC) pytest --cov --cov-report xml --verbose
102 | 
103 | ## Builds docker image
104 | build:
105 | $(DOCKER_COMPOSE_COMMAND) build $(SERVICE_NAME)
106 | 
107 | ## Remove poetry.lock and build docker image
108 | build-for-dependencies:
109 | rm -f *.lock
110 | $(DOCKER_COMPOSE_COMMAND) build $(SERVICE_NAME)
111 | 
112 | ## Lock dependencies with poetry
113 | lock-dependencies: build-for-dependencies
114 | $(DOCKER_COMPOSE_RUN) bash -c "if [ -e /home/$(USER_NAME)/poetry.lock.build ]; then cp /home/$(USER_NAME)/poetry.lock.build ./poetry.lock; else poetry lock; fi"
115 | 
116 | ## Starts docker containers using "docker-compose up -d"
117 | up:
118 | ifeq (, $(shell docker ps -a | grep $(CONTAINER_NAME)))
119 | @make down
120 | endif
121 | @$(DOCKER_COMPOSE_COMMAND) --profile $(PROFILE) up -d --remove-orphans
122 | 
123 | ## Starts prod docker containers
124 | up-prod:
125 | ifeq (, $(shell docker ps -a | grep $(PROD_CONTAINER_NAME)))
126 | 
@make down 127 | endif 128 | @$(DOCKER_COMPOSE_COMMAND) --profile $(PROD_PROFILE_NAME) up -d --remove-orphans 129 | 130 | ## docker-compose down 131 | down: 132 | $(DOCKER_COMPOSE_COMMAND) down 133 | 134 | ## Open an interactive shell in docker container 135 | exec-in: up 136 | docker exec -it $(CONTAINER_NAME) bash 137 | 138 | push: guard-IMAGE_TAG build 139 | @gcloud auth configure-docker --quiet europe-west4-docker.pkg.dev 140 | @docker tag "$${DOCKER_IMAGE_NAME}:latest" "$${GCP_DOCKER_REGISTRY_URL}:$${IMAGE_TAG}" 141 | @docker push "$${GCP_DOCKER_REGISTRY_URL}:$${IMAGE_TAG}" 142 | 143 | ## Run ssh tunnel for MLFlow 144 | mlflow-ssh-tunnel: 145 | gcloud compute ssh "$${VM_NAME}" --zone "$${ZONE}" --tunnel-through-iap -- -N -L "$${PROD_MLFLOW_SERVER_PORT}:localhost:$${PROD_MLFLOW_SERVER_PORT}" 146 | 147 | ## Clean MLFlow volumes 148 | clean-mlflow-volumes: down 149 | docker volume rm cybulde-model_postgresql-mlflow-data cybulde-model_mlflow-artifact-store 150 | 151 | ## Deploy etcd server on GCE 152 | deploy-etcd-server: 153 | chmod +x ./scripts/deploy-etcd-server.sh 154 | ./scripts/deploy-etcd-server.sh 155 | 156 | .DEFAULT_GOAL := help 157 | 158 | # Inspired by 159 | # sed script explained: 160 | # /^##/: 161 | # * save line in hold space 162 | # * purge line 163 | # * Loop: 164 | # * append newline + line to hold space 165 | # * go to next line 166 | # * if line starts with doc comment, strip comment character off and loop 167 | # * remove target prerequisites 168 | # * append hold space (+ newline) to line 169 | # * replace newline plus comments by `---` 170 | # * print line 171 | # Separate expressions are necessary because labels cannot be delimited by 172 | # semicolon; see 173 | .PHONY: help 174 | help: 175 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 176 | @echo 177 | @sed -n -e "/^## / { \ 178 | h; \ 179 | s/.*//; \ 180 | :doc" \ 181 | -e "H; \ 182 | n; \ 183 | s/^## //; \ 184 | t doc" \ 185 | -e "s/:.*//; \ 186 | G; \ 187 | s/\\n## /---/; \ 188 | s/\\n/ /g; \ 189 | p; \ 190 | }" ${MAKEFILE_LIST} \ 191 | | LC_ALL='C' sort --ignore-case \ 192 | | awk -F '---' \ 193 | -v ncol=$$(tput cols) \ 194 | -v indent=36 \ 195 | -v col_on="$$(tput setaf 6)" \ 196 | -v col_off="$$(tput sgr0)" \ 197 | '{ \ 198 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 199 | n = split($$2, words, " "); \ 200 | line_length = ncol - indent; \ 201 | for (i = 1; i <= n; i++) { \ 202 | line_length -= length(words[i]) + 1; \ 203 | if (line_length <= 0) { \ 204 | line_length = ncol - indent - length(words[i]) - 1; \ 205 | printf "\n%*s ", -indent, " "; \ 206 | } \ 207 | printf "%s ", words[i]; \ 208 | } \ 209 | printf "\n"; \ 210 | }' \ 211 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 212 | 213 | -------------------------------------------------------------------------------- /cybulde/infrastructure/instance_template_creator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | from google.cloud import compute_v1 6 | 7 | from cybulde.utils.gcp_utils import wait_for_extended_operation 8 | from cybulde.utils.utils import get_logger 9 | 10 | 11 | class VMType(Enum): 12 | STANDARD = "STANDARD" 13 | SPOT = "SPOT" 14 | PREEMPTIBLE = "PREEMPTIBLE" 15 | 16 | 17 | @dataclass 18 | class BootDiskConfig: 19 | project_id: str 20 | name: str 21 | size_gb: int 22 | labels: dict[str, str] 23 | 24 | 25 | @dataclass 26 | class VMConfig: 27 | 
machine_type: str 28 | accelerator_count: int 29 | accelerator_type: str 30 | vm_type: VMType 31 | disks: list[str] 32 | 33 | 34 | @dataclass 35 | class VMMetadataConfig: 36 | instance_group_name: str 37 | docker_image: str 38 | zone: str 39 | python_hash_seed: int 40 | mlflow_tracking_uri: str 41 | node_count: int 42 | disks: list[str] 43 | 44 | 45 | class InstanceTemplateCreator: 46 | def __init__( 47 | self, 48 | scopes: list[str], 49 | network: str, 50 | subnetwork: str, 51 | startup_script_path: str, 52 | vm_config: VMConfig, 53 | boot_disk_config: BootDiskConfig, 54 | vm_metadata_config: VMMetadataConfig, 55 | template_name: str, 56 | project_id: str, 57 | labels: dict[str, str] = {}, 58 | ) -> None: 59 | self.logger = get_logger(self.__class__.__name__) 60 | 61 | self.scopes = scopes 62 | self.network = network 63 | self.subnetwork = subnetwork 64 | self.startup_script_path = startup_script_path 65 | self.vm_config = vm_config 66 | self.boot_disk_config = boot_disk_config 67 | self.vm_metadata_config = vm_metadata_config 68 | self.template_name = template_name.lower() 69 | self.project_id = project_id 70 | self.labels = labels 71 | 72 | self.template = compute_v1.InstanceTemplate() 73 | self.template.name = self.template_name 74 | 75 | def create_template(self) -> compute_v1.InstanceTemplate: 76 | self.logger.info("Started creating instance template...") 77 | self.logger.info(f"{self.vm_metadata_config=}") 78 | 79 | self._create_boot_disk() 80 | self._attach_disks() 81 | self._create_network_interface() 82 | self._create_machine_configuration() 83 | self._attach_metadata() 84 | 85 | self.logger.info("Creating instance template...") 86 | template_client = compute_v1.InstanceTemplatesClient() 87 | operation = template_client.insert(project=self.project_id, instance_template_resource=self.template) 88 | wait_for_extended_operation(operation, "instance template creation") 89 | 90 | self.logger.info("Instance template has been created...") 91 | return template_client.get(project=self.project_id, instance_template=self.template_name) 92 | 93 | def _create_boot_disk(self) -> None: 94 | boot_disk = compute_v1.AttachedDisk() 95 | boot_disk_initialize_params = compute_v1.AttachedDiskInitializeParams() 96 | boot_disk_image = self._get_disk_image(self.boot_disk_config.project_id, self.boot_disk_config.name) 97 | boot_disk_initialize_params.source_image = boot_disk_image.self_link 98 | boot_disk_initialize_params.disk_size_gb = self.boot_disk_config.size_gb 99 | boot_disk_initialize_params.labels = self.boot_disk_config.labels 100 | boot_disk.initialize_params = boot_disk_initialize_params 101 | boot_disk.auto_delete = True 102 | boot_disk.boot = True 103 | boot_disk.device_name = self.boot_disk_config.name 104 | 105 | if boot_disk: 106 | self.template.properties.disks = [boot_disk] 107 | 108 | def _get_disk_image(self, project_id: str, image_name: str) -> compute_v1.Image: 109 | """ 110 | Retrieve detailed information about a single image from a project. 111 | Args: 112 | project_id: project ID or project number of the Cloud project you want to list images from. 113 | image_name: name of the image you want to get details of. 114 | Returns: 115 | An instance of compute_v1.Image object with information about specified image. 
116 | """ 117 | image_client = compute_v1.ImagesClient() 118 | return image_client.get(project=project_id, image=image_name) 119 | 120 | def _attach_disks(self) -> None: 121 | disk_names = self.vm_config.disks 122 | for disk_name in disk_names: 123 | disk = compute_v1.AttachedDisk( 124 | auto_delete=False, boot=False, mode="READ_ONLY", device_name=disk_name, source=disk_name 125 | ) 126 | self.template.properties.disks.append(disk) 127 | 128 | if len(disk_names) > 0: 129 | self.template.properties.metadata.items.append(compute_v1.Items(key="disks", value="\n".join(disk_names))) 130 | 131 | def _create_network_interface(self) -> None: 132 | network_interface = compute_v1.NetworkInterface() 133 | network_interface.name = "nic0" # The default value 134 | network_interface.network = self.network 135 | network_interface.subnetwork = self.subnetwork 136 | self.template.properties.network_interfaces = [network_interface] 137 | 138 | def _create_machine_configuration(self) -> None: 139 | self.template.properties.machine_type = self.vm_config.machine_type 140 | if self.vm_config.accelerator_count > 0: 141 | self.template.properties.guest_accelerators = [ 142 | compute_v1.AcceleratorConfig( 143 | accelerator_type=self.vm_config.accelerator_type, accelerator_count=self.vm_config.accelerator_count 144 | ) 145 | ] 146 | self.template.properties.service_accounts = [compute_v1.ServiceAccount(email="default", scopes=self.scopes)] 147 | self.template.properties.labels = self.labels 148 | 149 | vm_type = VMType(self.vm_config.vm_type) 150 | if vm_type == VMType.PREEMPTIBLE: 151 | self.logger.info("Using PREEMPTIBLE machine") 152 | self.template.properties.scheduling = compute_v1.Scheduling(preemptible=True) 153 | elif vm_type == VMType.SPOT: 154 | self.logger.info("Using SPOT machine") 155 | self.template.properties.scheduling = compute_v1.Scheduling( 156 | provisioning_model=compute_v1.Scheduling.ProvisioningModel.SPOT.name, # type: ignore 157 | on_host_maintenance=compute_v1.Scheduling.OnHostMaintenance.TERMINATE.name, # type: ignore 158 | ) 159 | elif vm_type == VMType.STANDARD: 160 | self.logger.info("Using STANDARD machine") 161 | self.template.properties.scheduling = compute_v1.Scheduling( 162 | provisioning_model=compute_v1.Scheduling.ProvisioningModel.STANDARD.name, # type: ignore 163 | on_host_maintenance=compute_v1.Scheduling.OnHostMaintenance.TERMINATE.name, # type: ignore 164 | ) 165 | else: 166 | raise RuntimeError(f"Unsupported {vm_type=}") 167 | 168 | def _attach_metadata(self) -> None: 169 | startup_script = self._read_startup_script(self.startup_script_path) 170 | self.template.properties.metadata.items.append(compute_v1.Items(key="startup-script", value=startup_script)) 171 | 172 | for meta_data_name, meta_data_value in self.vm_metadata_config.items(): # type: ignore 173 | self.template.properties.metadata.items.append( 174 | compute_v1.Items(key=meta_data_name, value=str(meta_data_value)) 175 | ) 176 | 177 | def _read_startup_script(self, startup_script_path: str) -> str: 178 | return Path(startup_script_path).read_text() 179 | -------------------------------------------------------------------------------- /cybulde/configs/automatically_generated/config.yaml: -------------------------------------------------------------------------------- 1 | # Do not edit this file. It is automatically generated by cybulde/generate_final_config.py. 2 | # If you want to modify configuration, edit source files in cybulde/configs directory. 
3 | 4 | defaults: 5 | - override hydra/hydra_logging: disabled 6 | - _self_ 7 | hydra: 8 | output_subdir: null 9 | run: 10 | dir: . 11 | 12 | infrastructure: 13 | project_id: cybulde 14 | zone: europe-west4-b 15 | instance_group_creator: 16 | _target_: cybulde.infrastructure.instance_group_creator.InstanceGroupCreator 17 | instance_template_creator: 18 | _target_: cybulde.infrastructure.instance_template_creator.InstanceTemplateCreator 19 | scopes: 20 | - https://www.googleapis.com/auth/cloud-platform 21 | - https://www.googleapis.com/auth/cloud.useraccounts.readonly 22 | - https://www.googleapis.com/auth/cloudruntimeconfig 23 | network: https://www.googleapis.com/compute/v1/projects/cybulde/global/networks/default 24 | subnetwork: https://www.googleapis.com/compute/v1/projects/cybulde/regions/europe-west4/subnetworks/default 25 | startup_script_path: scripts/vm_startup/task_runner_startup_script.sh 26 | vm_config: 27 | machine_type: n1-standard-8 28 | accelerator_count: 1 29 | accelerator_type: nvidia-tesla-t4 30 | vm_type: STANDARD 31 | disks: [] 32 | boot_disk_config: 33 | project_id: deeplearning-platform-release 34 | name: common-cu113-v20230925 35 | size_gb: 50 36 | labels: 37 | project: cybulde 38 | vm_metadata_config: 39 | instance_group_name: cybulde-None-20231115215415 40 | docker_image: null 41 | zone: europe-west4-b 42 | python_hash_seed: 42 43 | mlflow_tracking_uri: http://127.0.0.1:6101 44 | node_count: 1 45 | disks: [] 46 | etcd_ip: 10.164.0.12:2379 47 | template_name: cybulde-None-20231115215415 48 | project_id: cybulde 49 | labels: 50 | project: cybulde 51 | name: cybulde-None-20231115215415 52 | node_count: 1 53 | project_id: cybulde 54 | zone: europe-west4-b 55 | mlflow: 56 | mlflow_external_tracking_uri: http://127.0.0.1:6101 57 | mlflow_internal_tracking_uri: http://127.0.0.1:6101 58 | experiment_name: cybulde 59 | run_name: null 60 | run_id: 63b73e0189824d4abcbd42d07fa26428 61 | experiment_id: '16' 62 | experiment_url: http://127.0.0.1:6101/#/experiments/16/runs/63b73e0189824d4abcbd42d07fa26428 63 | artifact_uri: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts 64 | etcd_ip: 10.164.0.12:2379 65 | save_last_checkpoint_every_n_train_steps: 500 66 | seed: 1234 67 | tasks: 68 | binary_text_classification_task: 69 | _target_: cybulde.training.tasks.tar_model_exporting_training_task.TarModelExportingTrainingTask 70 | name: binary_text_classfication_task 71 | data_module: 72 | _target_: cybulde.data_modules.data_modules.TextClassificationDataModule 73 | batch_size: 1024 74 | shuffle: false 75 | num_workers: 8 76 | pin_memory: true 77 | drop_last: true 78 | persistent_workers: false 79 | train_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/train.parquet 80 | dev_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/dev.parquet 81 | test_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/test.parquet 82 | transformation: 83 | _target_: cybulde.models.transformations.HuggingFaceTokenizationTransformation 84 | pretrained_tokenizer_name_or_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer 85 | max_sequence_length: 200 86 | text_column_name: cleaned_text 87 | label_column_name: label 88 | lightning_module: 89 | _target_: cybulde.training.lightning_modules.binary_text_classification.BinaryTextClassificationTrainingLightningModule 90 | model: 91 | _target_: cybulde.models.models.BinaryTextClassificationModel 92 | backbone: 93 | _target_: cybulde.models.backbones.HuggingFaceBackbone 94 | 
transformation: 95 | _target_: cybulde.models.transformations.HuggingFaceTokenizationTransformation 96 | pretrained_tokenizer_name_or_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer 97 | max_sequence_length: 200 98 | pretrained_model_name_or_path: prajjwal1/bert-tiny 99 | pretrained: false 100 | adapter: 101 | _target_: cybulde.models.adapters.MLPWithPooling 102 | output_feature_sizes: 103 | - -1 104 | biases: null 105 | activation_fns: null 106 | dropout_drop_probs: null 107 | batch_norms: null 108 | order: LABDN 109 | standardize_input: true 110 | pooling_method: null 111 | output_attribute_to_use: pooler_output 112 | head: 113 | _target_: cybulde.models.heads.SigmoidHead 114 | in_features: 128 115 | out_features: 1 116 | loss: 117 | _target_: cybulde.training.loss_functions.BCEWithLogitsLoss 118 | reduction: mean 119 | optimizer: 120 | _target_: torch.optim.AdamW 121 | _partial_: true 122 | lr: 5.0e-05 123 | betas: 124 | - 0.9 125 | - 0.999 126 | eps: 1.0e-08 127 | weight_decay: 0.001 128 | amsgrad: false 129 | foreach: null 130 | maximize: false 131 | capturable: false 132 | scheduler: 133 | _target_: cybulde.training.schedulers.CommonLightningScheduler 134 | scheduler: 135 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 136 | _partial_: true 137 | mode: max 138 | factor: 0.1 139 | patience: 5 140 | threshold: 0.0001 141 | threshold_mode: rel 142 | cooldown: 0 143 | min_lr: 0.0 144 | eps: 1.0e-08 145 | verbose: false 146 | interval: epoch 147 | frequency: 1 148 | monitor: validation_f1_score 149 | strict: true 150 | name: null 151 | trainer: 152 | _target_: lightning.pytorch.trainer.trainer.Trainer 153 | accelerator: gpu 154 | strategy: ddp_find_unused_parameters_true 155 | devices: auto 156 | num_nodes: 1 157 | precision: 16-mixed 158 | logger: 159 | - _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 160 | experiment_name: cybulde 161 | run_name: null 162 | tracking_uri: http://127.0.0.1:6101 163 | tags: null 164 | save_dir: null 165 | prefix: '' 166 | artifact_location: null 167 | run_id: 63b73e0189824d4abcbd42d07fa26428 168 | callbacks: 169 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 170 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/best-checkpoints/ 171 | filename: null 172 | monitor: validation_f1_score 173 | verbose: false 174 | save_last: true 175 | save_top_k: 2 176 | mode: max 177 | auto_insert_metric_name: false 178 | save_weights_only: false 179 | every_n_train_steps: null 180 | train_time_interval: null 181 | every_n_epochs: null 182 | save_on_train_epoch_end: null 183 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 184 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/last-checkpoints/ 185 | filename: checkpoint-{epoch} 186 | monitor: null 187 | verbose: false 188 | save_last: true 189 | save_top_k: -1 190 | mode: min 191 | auto_insert_metric_name: false 192 | save_weights_only: false 193 | every_n_train_steps: 500 194 | train_time_interval: null 195 | every_n_epochs: null 196 | save_on_train_epoch_end: null 197 | - _target_: lightning.pytorch.callbacks.LearningRateMonitor 198 | logging_interval: step 199 | fast_dev_run: false 200 | max_epochs: 20 201 | min_epochs: null 202 | max_steps: -1 203 | min_steps: null 204 | max_time: null 205 | limit_train_batches: 1.0 206 | limit_val_batches: 1.0 207 | limit_test_batches: 1.0 208 | limit_predict_batches: 1.0 209 | overfit_batches: 0.0 210 | val_check_interval: 1.0 211 | check_val_every_n_epoch: 1 
212 | num_sanity_val_steps: 2 213 | log_every_n_steps: 20 214 | enable_checkpointing: true 215 | enable_progress_bar: true 216 | enable_model_summary: true 217 | accumulate_grad_batches: 1 218 | gradient_clip_val: 5.0 219 | gradient_clip_algorithm: value 220 | deterministic: null 221 | benchmark: null 222 | inference_mode: true 223 | use_distributed_sampler: true 224 | detect_anomaly: false 225 | barebones: false 226 | sync_batchnorm: true 227 | reload_dataloaders_every_n_epochs: 0 228 | default_root_dir: ./data/pytorch-lightning 229 | best_training_checkpoint: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/best-checkpoints/last.ckpt 230 | last_training_checkpoint: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/last-checkpoints/last.ckpt 231 | tar_model_export_path: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/exported_model.tar.gz 232 | binary_text_evaluation_task: 233 | _target_: cybulde.evaluation.tasks.common_evaluation_task.CommonEvaluationTask 234 | name: binary_text_evaluation_task 235 | data_module: 236 | _target_: cybulde.data_modules.data_modules.TextClassificationDataModule 237 | batch_size: 1024 238 | shuffle: false 239 | num_workers: 8 240 | pin_memory: true 241 | drop_last: true 242 | persistent_workers: false 243 | train_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/train.parquet 244 | dev_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/dev.parquet 245 | test_df_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/test.parquet 246 | transformation: 247 | _target_: cybulde.models.transformations.HuggingFaceTokenizationTransformation 248 | pretrained_tokenizer_name_or_path: gs://emkademy/cybulde/data/processed/rebalanced_splits/trained_tokenizer 249 | max_sequence_length: 200 250 | text_column_name: cleaned_text 251 | label_column_name: label 252 | lightning_module: 253 | _target_: cybulde.evaluation.lightning_modules.binary_text_evaluation.BinaryTextEvaluationLightningModule 254 | _partial_: true 255 | trainer: 256 | _target_: lightning.pytorch.trainer.trainer.Trainer 257 | accelerator: gpu 258 | strategy: ddp_find_unused_parameters_true 259 | devices: auto 260 | num_nodes: 1 261 | precision: 16-mixed 262 | logger: 263 | - _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 264 | experiment_name: cybulde 265 | run_name: null 266 | tracking_uri: http://127.0.0.1:6101 267 | tags: null 268 | save_dir: null 269 | prefix: '' 270 | artifact_location: null 271 | run_id: 63b73e0189824d4abcbd42d07fa26428 272 | callbacks: 273 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 274 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/best-checkpoints/ 275 | filename: null 276 | monitor: validation_f1_score 277 | verbose: false 278 | save_last: true 279 | save_top_k: 2 280 | mode: max 281 | auto_insert_metric_name: false 282 | save_weights_only: false 283 | every_n_train_steps: null 284 | train_time_interval: null 285 | every_n_epochs: null 286 | save_on_train_epoch_end: null 287 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 288 | dirpath: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/last-checkpoints/ 289 | filename: checkpoint-{epoch} 290 | monitor: null 291 | verbose: false 292 | save_last: true 293 | save_top_k: -1 294 | mode: min 295 | auto_insert_metric_name: false 296 | save_weights_only: false 297 | every_n_train_steps: 500 298 | train_time_interval: null 299 | every_n_epochs: null 300 | 
300 |         save_on_train_epoch_end: null
301 |       - _target_: lightning.pytorch.callbacks.LearningRateMonitor
302 |         logging_interval: step
303 |       fast_dev_run: false
304 |       max_epochs: 20
305 |       min_epochs: null
306 |       max_steps: -1
307 |       min_steps: null
308 |       max_time: null
309 |       limit_train_batches: 1.0
310 |       limit_val_batches: 1.0
311 |       limit_test_batches: 1.0
312 |       limit_predict_batches: 1.0
313 |       overfit_batches: 0.0
314 |       val_check_interval: 1.0
315 |       check_val_every_n_epoch: 1
316 |       num_sanity_val_steps: 2
317 |       log_every_n_steps: 20
318 |       enable_checkpointing: true
319 |       enable_progress_bar: true
320 |       enable_model_summary: true
321 |       accumulate_grad_batches: 1
322 |       gradient_clip_val: 5.0
323 |       gradient_clip_algorithm: value
324 |       deterministic: null
325 |       benchmark: null
326 |       inference_mode: true
327 |       use_distributed_sampler: true
328 |       detect_anomaly: false
329 |       barebones: false
330 |       sync_batchnorm: true
331 |       reload_dataloaders_every_n_epochs: 0
332 |       default_root_dir: ./data/pytorch-lightning
333 |     tar_model_path: /mlflow-artifact-store/16/63b73e0189824d4abcbd42d07fa26428/artifacts/exported_model.tar.gz
334 |     model_selector:
335 |       _target_: cybulde.evaluation.model_selector.ModelSelector
336 |       mlflow_run_id: 63b73e0189824d4abcbd42d07fa26428
337 |       must_be_better_metric_comparers:
338 |         f1_score:
339 |           _target_: cybulde.evaluation.model_selector.MetricComparer
340 |           bigger_is_better: true
341 |           can_be_equal: false
342 |           metric_name: test_f1_score
343 |           threshold: 0.0
344 |         model_size:
345 |           _target_: cybulde.evaluation.model_selector.MetricComparer
346 |           bigger_is_better: false
347 |           can_be_equal: true
348 |           metric_name: model_size
349 |           threshold: 0.0
350 |       to_be_thresholded_metric_comparers: {}
351 |       threshold: 0.0
352 |       registered_model_name: bert_tiny
353 | docker_image: null
354 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                     GNU GENERAL PUBLIC LICENSE
2 |                        Version 2, June 1991
3 | 
4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 |  Everyone is permitted to copy and distribute verbatim copies
7 |  of this license document, but changing it is not allowed.
8 | 
9 |                             Preamble
10 | 
11 |   The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 | 
21 |   When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 | 
28 |   To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 | 
33 |   For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 | 
39 |   We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 | 
43 |   Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 | 
50 |   Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 | 
56 |   The precise terms and conditions for copying, distribution and
57 | modification follow.
58 | 
59 |                     GNU GENERAL PUBLIC LICENSE
60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 | 
62 |   0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 | 
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 | 
79 |   1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 | 
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 | 
90 |   2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 | 
95 |     a) You must cause the modified files to carry prominent notices
96 |     stating that you changed the files and the date of any change.
97 | 
98 |     b) You must cause any work that you distribute or publish, that in
99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License. (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code. (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 
--------------------------------------------------------------------------------
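How a generated config like the one in cybulde/configs/automatically_generated/config.yaml
is consumed: a minimal sketch, assuming hydra-core and omegaconf are installed.
The config path and the `tasks` key layout below are assumptions inferred from
the file itself, not a confirmed project API.

    # Minimal sketch: turning the `_target_` nodes of the generated config
    # into live objects. Everything task-specific here is an assumption.
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("cybulde/configs/automatically_generated/config.yaml")
    task_cfg = cfg.tasks.binary_text_classification_task  # assumed key layout

    # instantiate() builds the class or callable named by `_target_` and, by
    # default, recurses into nested nodes (data_module -> transformation,
    # model -> backbone -> transformation, and so on).
    data_module = instantiate(task_cfg.data_module)

    # Nodes marked `_partial_: true` (the AdamW optimizer, the inner
    # ReduceLROnPlateau scheduler) come back as functools.partial objects,
    # because their remaining arguments -- the model parameters -- are only
    # known once the model has been built:
    optimizer_factory = instantiate(task_cfg.lightning_module.optimizer)
    # optimizer = optimizer_factory(model.parameters())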