├── app ├── __init__.py ├── backend │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── api_router.py │ │ └── routers │ │ │ ├── crashed_containers.py │ │ │ └── auth.py │ ├── core │ │ ├── __init__.py │ │ ├── state.py │ │ ├── database.py │ │ ├── password_utils.py │ │ ├── security.py │ │ ├── logger.py │ │ └── config.py │ ├── models │ │ ├── __init__.py │ │ ├── user.py │ │ └── crashed_container.py │ ├── schemas │ │ ├── __init__.py │ │ ├── user_schema.py │ │ ├── chart_stats_schema.py │ │ └── crashed_container_schema.py │ ├── services │ │ ├── __init__.py │ │ ├── stats_service.py │ │ └── monitor_service.py │ ├── utils │ │ ├── __init__.py │ │ └── string_utils.py │ ├── notifications │ │ ├── __init__.py │ │ ├── apprise_client.py │ │ └── notification_manager.py │ └── repositories │ │ ├── __init__.py │ │ ├── user_repository.py │ │ └── crashed_container_repository.py └── dashboard │ ├── src │ ├── assets │ │ └── logo.png │ ├── components │ │ ├── spinner │ │ │ └── spinner.tsx │ │ ├── navbar │ │ │ └── navbar.tsx │ │ ├── chart │ │ │ └── chart.tsx │ │ ├── datepickerform │ │ │ └── datepickerform.tsx │ │ └── datepicker │ │ │ └── datepicker.tsx │ ├── utils │ │ └── utils.ts │ ├── main.tsx │ ├── api │ │ ├── auth.ts │ │ ├── client.ts │ │ └── crashedContainers.ts │ ├── models │ │ └── crashedContainer.ts │ ├── App.tsx │ ├── App.css │ ├── context │ │ └── auth.tsx │ ├── index.css │ └── pages │ │ ├── login │ │ └── index.tsx │ │ └── homepage │ │ └── index.tsx │ ├── tsconfig.json │ ├── .gitignore │ ├── vite.config.ts │ ├── index.html │ ├── eslint.config.js │ ├── tsconfig.node.json │ ├── tsconfig.app.json │ ├── package.json │ └── README.md ├── .gitignore ├── requirements.txt ├── docs └── images │ └── preview.png ├── .releaserc.json ├── Dockerfile ├── .github ├── FUNDING.yml └── workflows │ └── release.yml ├── LICENSE ├── example.env ├── CHANGELOG.md ├── main.py └── README.md /app/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /app/backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/services/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/notifications/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/backend/repositories/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv 3 | 
__pycache__/ 4 | *.py[cod] 5 | *.db -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kRYstall9/docker-surgeon/HEAD/requirements.txt -------------------------------------------------------------------------------- /docs/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kRYstall9/docker-surgeon/HEAD/docs/images/preview.png -------------------------------------------------------------------------------- /app/dashboard/src/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kRYstall9/docker-surgeon/HEAD/app/dashboard/src/assets/logo.png -------------------------------------------------------------------------------- /app/backend/core/state.py: -------------------------------------------------------------------------------- 1 | from app.backend.core.config import Config 2 | from logging import Logger 3 | 4 | config: Config | None = None 5 | logger: Logger | None = None -------------------------------------------------------------------------------- /app/dashboard/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [], 3 | "references": [ 4 | { "path": "./tsconfig.app.json" }, 5 | { "path": "./tsconfig.node.json" } 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /app/dashboard/src/components/spinner/spinner.tsx: -------------------------------------------------------------------------------- 1 | export function Spinner() { 2 | return ( 3 |
4 | ); 5 | } 6 | -------------------------------------------------------------------------------- /app/backend/api/api_router.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | from app.backend.api.routers import auth, crashed_containers 3 | 4 | api_router = APIRouter() 5 | api_router.include_router(crashed_containers.router) 6 | api_router.include_router(auth.router) -------------------------------------------------------------------------------- /app/dashboard/src/utils/utils.ts: -------------------------------------------------------------------------------- 1 | export function formatLocalDate(date: Date){ 2 | const year = date.getFullYear(); 3 | const month = String(date.getMonth() + 1).padStart(2, "0"); 4 | const day = String(date.getDate()).padStart(2, "0"); 5 | 6 | return `${year}-${month}-${day}`; 7 | } -------------------------------------------------------------------------------- /app/dashboard/src/main.tsx: -------------------------------------------------------------------------------- 1 | import { StrictMode } from 'react' 2 | import { createRoot } from 'react-dom/client' 3 | import './index.css' 4 | import App from './App.tsx' 5 | 6 | createRoot(document.getElementById('root')!).render( 7 | 8 | 9 | , 10 | ) 11 | -------------------------------------------------------------------------------- /app/dashboard/src/api/auth.ts: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | 3 | export async function login(password: string){ 4 | return axios.post("/api/auth/login", { password }, { 5 | withCredentials: true 6 | }); 7 | } 8 | 9 | export async function logout(){ 10 | return axios.post("/api/logout", {}, { 11 | withCredentials: true 12 | }); 13 | } -------------------------------------------------------------------------------- /app/backend/schemas/user_schema.py: 
def normalize_escapes(value: str) -> str:
    r"""
    Convert literal escape sequences (\n, \t, \r, \uXXXX) into real characters.

    Args:
        value: Text that may contain backslash escape sequences written out
            literally (for example a backslash followed by ``n``).

    Returns:
        The text with escape sequences decoded. Non-string inputs, and strings
        whose escapes cannot be decoded (e.g. a trailing lone backslash), are
        returned unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        # "unicode_escape" decodes bytes as Latin-1, so the input must be
        # encoded as Latin-1 too; encoding as UTF-8 would turn every non-ASCII
        # character into mojibake. Characters outside Latin-1 are written as
        # \uXXXX / \UXXXXXXXX by "backslashreplace" and decoded straight back.
        return value.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeError:
        # Best effort: malformed escapes leave the original text untouched.
        return value
-------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from datetime import date 3 | 4 | class ChartStats(BaseModel): 5 | container_id:str = Field(description="The unique identifier of the container") 6 | container_name:str = Field(description="The name of the container") 7 | crash_count:int = Field(description="The number of times the container has crashed on the specified date") 8 | crashed_on:date 9 | -------------------------------------------------------------------------------- /app/backend/schemas/crashed_container_schema.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pydantic import BaseModel, Field 3 | 4 | 5 | class CrashedContainerBase(BaseModel): 6 | container_id:str 7 | container_name: str | None = None 8 | logs:str 9 | 10 | class CrashedContainerLogs(CrashedContainerBase): 11 | crashed_on:datetime 12 | 13 | class CrashedContainerStats(CrashedContainerBase): 14 | crash_count:int 15 | -------------------------------------------------------------------------------- /app/dashboard/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Docker Surgeon 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /app/dashboard/src/App.tsx: -------------------------------------------------------------------------------- 1 | import './App.css' 2 | import { BrowserRouter, Routes, Route } from 'react-router-dom' 3 | import { Homepage } from './pages/homepage' 4 | import { Login } from './pages/login' 5 | 6 | function App() { 7 | 8 | return ( 9 | 10 | 11 | }> 12 | }> 13 | 14 | 15 | ) 16 | } 17 | 18 | export default App 19 | -------------------------------------------------------------------------------- /app/backend/models/user.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Date, ForeignKey 2 | from sqlalchemy.orm import relationship 3 | from app.backend.core.database import Base 4 | 5 | class User(Base): 6 | __tablename__ = "users" 7 | 8 | id = Column(Integer, primary_key=True) 9 | username = Column(String(200), unique=True, nullable=False) 10 | password = Column(String(100), unique=True, nullable=False) 11 | createdon = Column(Date, nullable=False) 12 | 13 | def __repr__(self): 14 | return f"" -------------------------------------------------------------------------------- /app/backend/core/database.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | from sqlalchemy import create_engine 3 | from sqlalchemy.ext.declarative import declarative_base 4 | from sqlalchemy.orm import sessionmaker 5 | 6 | engine = create_engine('sqlite:///./app/data/database.db', echo=False) 7 | Base = declarative_base() 8 | SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False) 9 | 10 | 11 | def init_db(logger:Logger): 12 | from app.backend.models.crashed_container import CrashedContainer 13 | 14 | Base.metadata.create_all(engine) 15 | 16 | logger.info('DB initialized') 
def verify_hash(user_input: str, stored: str) -> bool:
    """
    Check a submitted password against the stored credential.

    The stored value's prefix selects the scheme: ``$argon2`` is verified with
    the module-level Argon2 ``password_hasher``, ``$2`` with ``bcrypt``, and
    anything else is treated as a plain-text password.

    Args:
        user_input: Password supplied by the caller.
        stored: Stored credential (Argon2 hash, bcrypt hash, or plain text).

    Returns:
        True when the input matches, False otherwise. Non-string arguments
        (e.g. a missing password or an unset credential) never match.
    """
    # Local import keeps this function self-contained.
    import hmac

    # Fail closed instead of raising AttributeError on `.startswith` / `.encode`.
    if not isinstance(user_input, str) or not isinstance(stored, str):
        return False

    # Argon2
    if stored.startswith("$argon2"):
        try:
            return password_hasher.verify(stored, user_input)
        except Exception:
            # Any failure reported by the hasher is treated as a mismatch.
            return False

    # Bcrypt
    if stored.startswith("$2"):
        try:
            return bcrypt.checkpw(user_input.encode(), stored.encode())
        except Exception:
            # Malformed hashes or unencodable input are treated as a mismatch.
            return False

    # Plain text: compare in constant time so response timing does not reveal
    # how much of the secret matched. "surrogatepass" keeps lone surrogates
    # (possible in JSON input) from raising during encoding.
    # NOTE(review): an empty stored password still matches an empty input;
    # confirm the configuration layer rejects an empty ADMIN_PASSWORD.
    return hmac.compare_digest(
        user_input.encode("utf-8", "surrogatepass"),
        stored.encode("utf-8", "surrogatepass"),
    )
cls._instance.apprise_client.add(url) 16 | 17 | return cls._instance 18 | 19 | def send(self, title:str, body:str): 20 | self.apprise_client.notify(body=body, title=title) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:22-slim AS dashboard_builder 2 | 3 | WORKDIR /dashboard 4 | COPY app/dashboard/package.json app/dashboard/package-lock.json ./ 5 | RUN npm ci 6 | 7 | COPY app/dashboard/ ./ 8 | RUN npm run build 9 | 10 | 11 | FROM python:3.13-slim 12 | 13 | RUN apt-get update && \ 14 | apt-get install -y --no-install-recommends bash sqlite3 && \ 15 | rm -rf /var/lib/apt/lists/* 16 | 17 | WORKDIR /app 18 | 19 | COPY app/ ./app 20 | COPY main.py . 21 | COPY requirements.txt . 22 | 23 | RUN rm -rf ./app/dashboard 24 | 25 | COPY --from=dashboard_builder /dashboard/dist ./app/dashboard_build 26 | 27 | RUN mkdir -p ./app/data 28 | RUN pip install --no-cache-dir -r requirements.txt 29 | 30 | CMD ["python", "main.py"] -------------------------------------------------------------------------------- /app/dashboard/eslint.config.js: -------------------------------------------------------------------------------- 1 | import js from '@eslint/js' 2 | import globals from 'globals' 3 | import reactHooks from 'eslint-plugin-react-hooks' 4 | import reactRefresh from 'eslint-plugin-react-refresh' 5 | import tseslint from 'typescript-eslint' 6 | import { defineConfig, globalIgnores } from 'eslint/config' 7 | 8 | export default defineConfig([ 9 | globalIgnores(['dist']), 10 | { 11 | files: ['**/*.{ts,tsx}'], 12 | extends: [ 13 | js.configs.recommended, 14 | tseslint.configs.recommended, 15 | reactHooks.configs['recommended-latest'], 16 | reactRefresh.configs.vite, 17 | ], 18 | languageOptions: { 19 | ecmaVersion: 2020, 20 | globals: globals.browser, 21 | }, 22 | }, 23 | ]) 24 | 
-------------------------------------------------------------------------------- /app/dashboard/tsconfig.node.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", 4 | "target": "ES2023", 5 | "lib": ["ES2023"], 6 | "module": "ESNext", 7 | "types": ["node"], 8 | "skipLibCheck": true, 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "verbatimModuleSyntax": true, 14 | "moduleDetection": "force", 15 | "noEmit": true, 16 | 17 | /* Linting */ 18 | "strict": true, 19 | "noUnusedLocals": true, 20 | "noUnusedParameters": true, 21 | "erasableSyntaxOnly": true, 22 | "noFallthroughCasesInSwitch": true, 23 | "noUncheckedSideEffectImports": true 24 | }, 25 | "include": ["vite.config.ts"] 26 | } 27 | -------------------------------------------------------------------------------- /app/dashboard/tsconfig.app.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", 4 | "target": "ES2022", 5 | "useDefineForClassFields": true, 6 | "lib": ["ES2022", "DOM", "DOM.Iterable"], 7 | "module": "ESNext", 8 | "types": ["vite/client"], 9 | "skipLibCheck": true, 10 | 11 | /* Bundler mode */ 12 | "moduleResolution": "bundler", 13 | "allowImportingTsExtensions": true, 14 | "verbatimModuleSyntax": true, 15 | "moduleDetection": "force", 16 | "noEmit": true, 17 | "jsx": "react-jsx", 18 | 19 | /* Linting */ 20 | "strict": true, 21 | "noUnusedLocals": true, 22 | "noUnusedParameters": true, 23 | "erasableSyntaxOnly": true, 24 | "noFallthroughCasesInSwitch": true, 25 | "noUncheckedSideEffectImports": true 26 | }, 27 | "include": ["src"] 28 | } 29 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: 
def _bad_request(exc: ValueError):
    """Build the HTTP 400 error returned for a rejected query parameter."""
    # Imported locally so the handlers do not rely on the module's import line.
    from fastapi import HTTPException

    return HTTPException(status_code=400, detail=str(exc))


@router.get("", response_model=list[CrashedContainerLogs])
def list_crashed_containers(date_from: str, date_to: str):
    """
    Return the crashed-container records between two dates.

    Args:
        date_from: Start of the range, expected as YYYY-MM-DD.
        date_to: End of the range, expected as YYYY-MM-DD.

    Raises:
        HTTPException: 400 when the service rejects a date with ValueError
            (previously this surfaced as an unhandled 500).
    """
    try:
        return StatsService.get_crashed_containers(date_from, date_to)
    except ValueError as exc:
        raise _bad_request(exc) from exc


@router.get("/chart-stats", response_model=list[ChartStats])
def get_crashed_containers_graph_stats(date_from: str, date_to: str):
    """
    Return the crash statistics used by the dashboard chart.

    Args:
        date_from: Start of the range, expected as YYYY-MM-DD.
        date_to: End of the range, expected as YYYY-MM-DD.

    Raises:
        HTTPException: 400 when the service rejects a date with ValueError
            (previously this surfaced as an unhandled 500).
    """
    try:
        return StatsService.get_crashed_containers_chart_stats(date_from, date_to)
    except ValueError as exc:
        raise _bad_request(exc) from exc
8 |
9 | Logo 10 |

Docker Surgeon

11 |
12 |
13 | 23 |
24 |
25 | ); 26 | } 27 | -------------------------------------------------------------------------------- /app/dashboard/src/api/crashedContainers.ts: -------------------------------------------------------------------------------- 1 | import { api } from "./client"; 2 | import type { 3 | CrashedContainerChartStats, 4 | CrashedContainerLogs, 5 | } from "../models/crashedContainer"; 6 | 7 | export async function getChartStats( 8 | date_from: string, 9 | date_to: string 10 | ): Promise { 11 | try { 12 | const res = await api.get("/crashed_containers/chart-stats", { 13 | params: { date_from, date_to }, 14 | }); 15 | 16 | return res.data; 17 | } catch (error) { 18 | console.error(error); 19 | return []; 20 | } 21 | } 22 | 23 | export async function getCrashedContainersMetrics( 24 | date_from: string, 25 | date_to: string 26 | ): Promise { 27 | try { 28 | const res = await api.get("/crashed_containers", { 29 | params: { date_from, date_to }, 30 | }); 31 | 32 | return res.data; 33 | } catch (error) { 34 | console.error(error); 35 | return []; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /app/backend/api/routers/auth.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Response, HTTPException 2 | from app.backend.core.config import Config 3 | from app.backend.core.password_utils import verify_hash 4 | from app.backend.core.security import create_token 5 | 6 | router = APIRouter() 7 | 8 | @router.post("/auth/login") 9 | def login(body:dict , response: Response): 10 | password = body.get("password") 11 | config = Config.get() 12 | 13 | if not verify_hash(password, config.admin_password): 14 | raise HTTPException(status_code=401, detail="Invalid password") 15 | 16 | token = create_token({"sub": "admin"}) 17 | 18 | response.set_cookie( 19 | key="token", 20 | value=token, 21 | httponly=True, 22 | max_age=3600, 23 | path="/" 24 | ) 25 | 26 | return {"ok": True} 
class UserRepository:
    """Database access helpers for ``User`` rows."""

    @staticmethod
    def get_user(uid: int):
        """Return the first ``User`` row whose id equals ``uid``, or None."""
        with SessionLocal() as db:
            return db.query(User).filter(User.id == uid).first()

    @staticmethod
    def update_user(user: UserSchema, logger: Logger):
        """
        Update the stored user that has the same id as ``user``.

        Args:
            user: Object carrying ``id`` and ``username``, and optionally
                ``password``.
            logger: Logger used to report the outcome.

        Returns:
            The refreshed ``User`` row, or None when no row has that id.
        """
        with SessionLocal() as db:
            db_user = db.query(User).filter(User.id == user.id).first()

            if not db_user:
                logger.error(f"User with id {user.id} not found for update")
                return None

            db_user.username = user.username

            # The User schema declares no `password` field, so reading
            # `user.password` unconditionally raised AttributeError. Only
            # overwrite the stored password when the caller supplies one.
            # NOTE(review): the value is stored as given; confirm callers
            # hash it before calling this method.
            new_password = getattr(user, "password", None)
            if new_password is not None:
                db_user.password = new_password

            db.commit()
            db.refresh(db_user)

            logger.info(f"User with id {user.id} updated successfully")

            return db_user
class StatsService:
    """Date-range queries over recorded container crashes."""

    # Format accepted for both ends of a date range.
    _DATE_FORMAT = "%Y-%m-%d"

    @staticmethod
    def _parse_range(date_from: str, date_to: str):
        """Parse both range bounds, raising ValueError on any malformed value."""
        fmt = StatsService._DATE_FORMAT
        try:
            return datetime.strptime(date_from, fmt), datetime.strptime(date_to, fmt)
        except (TypeError, ValueError) as exc:
            # TypeError covers non-string input such as None; the original
            # error is chained so the offending value stays in the traceback.
            raise ValueError("Incorrect date format, should be YYYY-MM-DD") from exc

    @staticmethod
    def get_crashed_containers(date_from: str, date_to: str):
        """
        Return the crashed containers recorded between two dates.

        Args:
            date_from: Range start as YYYY-MM-DD.
            date_to: Range end as YYYY-MM-DD.

        Returns:
            Whatever ``CrashedContainerRepository.get_all_crashed_containers``
            returns for the parsed range.

        Raises:
            ValueError: If either date is not in YYYY-MM-DD format.
        """
        # NOTE(review): both bounds are parsed to midnight; confirm the
        # repository treats `date_to` as covering the whole final day.
        start, end = StatsService._parse_range(date_from, date_to)
        return CrashedContainerRepository.get_all_crashed_containers(start, end)

    @staticmethod
    def get_crashed_containers_chart_stats(date_from: str, date_to: str):
        """
        Return the per-date crash statistics between two dates.

        Args:
            date_from: Range start as YYYY-MM-DD.
            date_to: Range end as YYYY-MM-DD.

        Returns:
            Whatever
            ``CrashedContainerRepository.get_crashed_containers_stats_by_date``
            returns for the parsed range.

        Raises:
            ValueError: If either date is not in YYYY-MM-DD format.
        """
        start, end = StatsService._parse_range(date_from, date_to)
        return CrashedContainerRepository.get_crashed_containers_stats_by_date(start, end)
--homepage-bg-color: #262E36; 20 | --input-bg-color: #2D2D2D; 21 | } 22 | 23 | * { 24 | box-sizing: border-box; 25 | } 26 | 27 | a { 28 | font-weight: 500; 29 | color: #646cff; 30 | text-decoration: inherit; 31 | } 32 | a:hover { 33 | color: #535bf2; 34 | } 35 | 36 | body { 37 | margin: 0; 38 | padding: 0; 39 | min-height: 100vh; 40 | height: auto; 41 | } 42 | 43 | h1 { 44 | font-size: 3.2em; 45 | line-height: 1.1; 46 | } 47 | 48 | @media (prefers-color-scheme: light) { 49 | :root { 50 | color: #213547; 51 | background-color: #ffffff; 52 | } 53 | a:hover { 54 | color: #747bff; 55 | } 56 | button { 57 | background-color: #f9f9f9; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /app/dashboard/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dashboard", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc -b && vite build", 9 | "lint": "eslint .", 10 | "preview": "vite preview" 11 | }, 12 | "dependencies": { 13 | "@tailwindcss/vite": "^4.1.17", 14 | "axios": "^1.13.1", 15 | "chart.js": "^4.5.1", 16 | "lucide-react": "^0.553.0", 17 | "react": "^19.1.1", 18 | "react-chartjs-2": "^5.3.1", 19 | "react-dom": "^19.1.1", 20 | "react-router-dom": "^7.9.5", 21 | "save-dev": "^0.0.1-security", 22 | "tailwindcss": "^4.1.17" 23 | }, 24 | "devDependencies": { 25 | "@eslint/js": "^9.36.0", 26 | "@types/node": "^24.6.0", 27 | "@types/react": "^19.2.2", 28 | "@types/react-dom": "^19.2.2", 29 | "@vitejs/plugin-react": "^5.0.4", 30 | "babel-plugin-react-compiler": "^19.1.0-rc.3", 31 | "eslint": "^9.36.0", 32 | "eslint-plugin-react-hooks": "^5.2.0", 33 | "eslint-plugin-react-refresh": "^0.4.22", 34 | "globals": "^16.4.0", 35 | "typescript": "~5.9.3", 36 | "typescript-eslint": "^8.45.0", 37 | "vite": "npm:rolldown-vite@7.1.14" 38 | }, 39 | "overrides": { 40 | "vite": "npm:rolldown-vite@7.1.14" 41 | } 42 | 
} 43 | -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | RESTART_POLICY = '{ 2 | "excludedContainers": ["container_name"], #-> More than 1 container could be excluded. Specify them as ["container1", "container2"] 3 | "statuses": { 4 | "exited": { 5 | "codesToExclude": [0] #-> More than 1 exit code could be excluded. Specify them as ["code1", "code2", "code3"] 6 | } 7 | } 8 | }' 9 | 10 | ENABLE_DASHBOARD=True #-> Possible values [True | False] 11 | LOGS_AMOUNT=10 #-> This will display the last n logs on the dashboard to clearly indicate the issue that triggered the restart policy 12 | DASHBOARD_ADDRESS=0.0.0.0 #-> Possible values [0.0.0.0 | 127.0.0.1] 13 | DASHBOARD_PORT=8000 #-> Possible values [ Any free port ] 14 | ADMIN_PASSWORD= 15 | ENABLE_NOTIFICATIONS=True #-> Possible values [True | False] 16 | NOTIFICATION_URLS='["url1", "url2"]' #-> Check https://github.com/caronc/apprise/wiki#notification-services 17 | NOTIFICATION_TITLE="" #-> Edit the notification title as you wish 18 | NOTIFICATION_BODY="" #-> Edit the notification body as you wish 19 | 20 | 21 | 22 | ############### 23 | # LOGGING # 24 | ############### 25 | 26 | # --- Log Level --- 27 | # Set the verbosity of logs. Options: "error", "warn", "info", "debug" 28 | # Default: info 29 | LOG_LEVEL= info 30 | 31 | # --- Log Timezone --- 32 | # Adjust the timezone used for logging 33 | # e.g. 
Europe/Rome, America/New_York 34 | LOG_TIMEZONE=UTC -------------------------------------------------------------------------------- /app/backend/core/security.py: -------------------------------------------------------------------------------- 1 | import secrets 2 | from pathlib import Path 3 | from datetime import datetime, timezone, timedelta 4 | from fastapi import HTTPException, Request 5 | import jwt 6 | 7 | SECRET_FILE = Path("/app/app/data/jwt_secret.key") 8 | 9 | def load_or_create_jwt_secret() -> str: 10 | if SECRET_FILE.exists(): 11 | return SECRET_FILE.read_text().strip() 12 | 13 | SECRET_FILE.parent.mkdir(parents=True, exist_ok=True) 14 | 15 | new_token = secrets.token_hex(64) 16 | SECRET_FILE.write_text(new_token) 17 | return new_token 18 | 19 | JWT_SECRET = load_or_create_jwt_secret() 20 | 21 | def create_token(data:dict, expires_minutes:int = 60): 22 | to_encode= data.copy() 23 | expire = datetime.now(timezone.utc) + timedelta(minutes=expires_minutes) 24 | to_encode["exp"] = expire 25 | 26 | return jwt.encode(to_encode, JWT_SECRET, algorithm='HS256') 27 | 28 | 29 | def require_admin(request: Request): 30 | 31 | token = request.cookies.get("token") 32 | if not token: 33 | raise HTTPException(status_code=401, detail="Missing token") 34 | 35 | try: 36 | payload = jwt.decode(token, JWT_SECRET, algorithms=["HS256"]) 37 | except jwt.PyJWTError: 38 | raise HTTPException(status_code=401, detail="Invalid token") 39 | 40 | if payload.get("sub") != "admin": 41 | raise HTTPException(status_code=403, detail="Not authorized") 42 | 43 | return payload -------------------------------------------------------------------------------- /app/backend/notifications/notification_manager.py: -------------------------------------------------------------------------------- 1 | from app.backend.notifications.apprise_client import AppriseClient 2 | from app.backend.core.logger import get_logger 3 | from app.backend.core.config import Config 4 | import re 5 | 6 | config = 
class NotificationManager:
    """Formats crash notifications and hands them to the module-level client."""

    @staticmethod
    def container_crashed_event(container_name: str, container_logs: bytes | str, container_exit_code: str):
        """Send a "container crashed" notification (best effort).

        The title and body come from ``config.notification_title`` and
        ``config.notification_body`` when set, otherwise from built-in
        defaults. Both are ``str.format`` templates that may use the
        placeholders ``{container_name}``, ``{logs}``, ``{exit_code}`` and
        ``{n_logs}``.

        Args:
            container_name: Name of the crashed container.
            container_logs: Log tail of the container, as raw bytes or as
                already-decoded text.
            container_exit_code: Exit code reported for the container.

        Any failure while building or sending the message is logged and
        swallowed, so a notification problem never propagates to the caller.
        """
        logger.info("Sending notifications")

        try:
            # The original decoded unconditionally although the parameter was
            # annotated ``str``; a str argument raised AttributeError, which
            # the handler below swallowed, silently dropping the notification.
            if isinstance(container_logs, (bytes, bytearray)):
                raw_logs = container_logs.decode('utf-8', errors='ignore')
            elif container_logs is None:
                raw_logs = ''
            else:
                raw_logs = str(container_logs)

            context = {
                "container_name": container_name,
                # Strip ANSI colour/control sequences so they do not show up
                # as garbage in the notification text.
                "logs": ANSI_ESCAPE.sub('', raw_logs),
                "exit_code": container_exit_code,
                "n_logs": config.logs_amount
            }

            title = (config.notification_title or '⚠️ {container_name} crashed').format(**context)
            body = (config.notification_body or '`exit code`: `{exit_code}`\nLast {n_logs} logs of `{container_name}`: {logs}').format(**context)
            client.send(body=body, title=title)

            logger.info("Notification sent")

        except Exception as e:
            # Deliberately best-effort: keep the caller running, but record
            # the traceback so template or transport errors can be diagnosed.
            logger.exception("Failed to send notification: %s", e)
def get_bootstrap_logger() -> Logger:
    """Return the minimal logger used before the configuration is loaded.

    The logger is named ``"bootstrap"``, logs at INFO level and writes through
    a single ``StreamHandler``. Calling the function repeatedly returns the
    same logger without stacking additional handlers.
    """
    logger = logging.getLogger("bootstrap")
    logger.setLevel(logging.INFO)
    # Match get_logger(): this logger owns its handler, so records must not
    # also bubble up to the root logger. Otherwise every line is printed
    # twice as soon as the root logger gains a handler (which happens
    # implicitly the first time a module-level logging.warning() is called).
    logger.propagate = False

    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - Func: %(funcName)s - [MSG]: %(message)s'))
        logger.addHandler(handler)

    return logger
if __name__ == '__main__':
    # Entry point: load the configuration, start the container monitor in a
    # background thread and, when enabled, serve the dashboard and API.

    # Only the bootstrap logger is available until the config has been read.
    logger = get_bootstrap_logger()

    try:
        config = Config.load()
        state.config = config
    except Exception as e:
        logger.error(e)
        # SystemExit instead of exit(): the latter is injected by the `site`
        # module and is not guaranteed to exist in every interpreter setup.
        raise SystemExit(1)

    # Switch to the fully configured logger (level and timezone from config).
    logger = get_logger(config)
    logger.info(f"Config successfully loaded!\n{config}")
    state.logger = logger
    init_db(logger)

    # Daemon thread: it must not keep the process alive on its own.
    worker = Thread(target=monitor_containers, args=(config, logger), daemon=True)
    worker.start()

    if config.enable_dashboard:
        # Imported lazily so the web stack is only loaded when it is needed.
        import uvicorn
        from os import path
        from fastapi import FastAPI
        from app.backend.api.api_router import api_router
        from fastapi.responses import FileResponse
        from fastapi.staticfiles import StaticFiles

        logger.info("Starting FastAPI server for Docker Surgeon API...")
        DASHBOARD_DIR = "app/dashboard_build"

        app = FastAPI(
            title="Docker Surgeon API",
            description="A tool to monitor and manage Docker containers."
        )

        app.mount('/assets', StaticFiles(directory=f"{DASHBOARD_DIR}/assets"), name="assets")
        app.include_router(router=api_router, prefix='/api')

        # Catch-all route, registered after /assets and /api: every other
        # path returns the dashboard's index.html.
        @app.get("/{full_path:path}")
        def serve_dashboard(full_path: str):
            index_path = path.join(f"{DASHBOARD_DIR}", "index.html")
            return FileResponse(index_path)

        # Blocks until the server shuts down.
        uvicorn.run(app, host=config.dashboard_address, port=config.dashboard_port, reload=False)

        # uvicorn.run() only returns once the server has exited, so the old
        # "FastAPI server started" message was emitted at shutdown.
        logger.info("FastAPI server stopped")

    # Keep the main thread alive for as long as the monitor is running.
    worker.join()
24 |
25 |
26 | Logo 27 | Docker Surgeon 28 |
29 | 30 |
31 |

Login

32 | 33 |
34 | 40 | setPassword(e.target.value)} 46 | required 47 | /> 48 | {error &&

{error}

} 49 |
50 | 51 | 57 |
58 |
59 |
60 | ); 61 | } 62 | -------------------------------------------------------------------------------- /app/dashboard/src/components/chart/chart.tsx: -------------------------------------------------------------------------------- 1 | import { Bar } from "react-chartjs-2"; 2 | import { 3 | Chart as ChartJS, 4 | CategoryScale, 5 | LinearScale, 6 | PointElement, 7 | BarElement, 8 | Title, 9 | Tooltip, 10 | Legend, 11 | } from "chart.js"; 12 | import type { CrashedContainerChartStats } from "../../models/crashedContainer"; 13 | import { Spinner } from "../spinner/spinner"; 14 | 15 | interface ChartProps { 16 | loading: boolean; 17 | stats: CrashedContainerChartStats[]; 18 | } 19 | 20 | export function Chart({ loading, stats }: ChartProps) { 21 | ChartJS.register( 22 | CategoryScale, 23 | LinearScale, 24 | PointElement, 25 | BarElement, 26 | Title, 27 | Tooltip, 28 | Legend 29 | ); 30 | 31 | const options = { 32 | responsive: true, 33 | maintainAspectRatio: false, 34 | plugins: { 35 | legend: { 36 | position: "top" as const, 37 | }, 38 | title: { 39 | display: true, 40 | text: "Crashed Containers Chart", 41 | }, 42 | }, 43 | }; 44 | 45 | const safeStats = Array.isArray(stats) ? 
stats : []; 46 | 47 | const labels = Array.from( 48 | new Set( 49 | safeStats.map( 50 | (s: CrashedContainerChartStats) => 51 | new Date(s.crashed_on!).toISOString().split("T")[0] 52 | ) 53 | ) 54 | ) 55 | .sort( 56 | (a: string, b: string) => new Date(a).getTime() - new Date(b).getTime() 57 | ) 58 | .map((dateStr: string) => new Date(dateStr).toISOString().split("T")[0]); 59 | 60 | const containers = Array.from( 61 | new Set(safeStats.map((s: CrashedContainerChartStats) => s.container_name)) 62 | ); 63 | 64 | const datasets = containers.map((name) => { 65 | return { 66 | label: name, 67 | backgroundColor: `hsl(${Math.random() * 360}, 70%, 60%)`, 68 | data: labels.map((date) => { 69 | const stat = safeStats.find( 70 | (s: CrashedContainerChartStats) => 71 | s.crashed_on === date && s.container_name === name 72 | ); 73 | return stat ? stat.crash_count : 0; 74 | }), 75 | }; 76 | }); 77 | 78 | const data = { 79 | labels, 80 | datasets: datasets, 81 | }; 82 | 83 | return ( 84 |
85 | {loading && } 86 | {!loading && } 87 |
88 | ); 89 | } 90 | -------------------------------------------------------------------------------- /app/dashboard/README.md: -------------------------------------------------------------------------------- 1 | # React + TypeScript + Vite 2 | 3 | This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules. 4 | 5 | Currently, two official plugins are available: 6 | 7 | - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) (or [oxc](https://oxc.rs) when used in [rolldown-vite](https://vite.dev/guide/rolldown)) for Fast Refresh 8 | - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh 9 | 10 | ## React Compiler 11 | 12 | The React Compiler is enabled on this template. See [this documentation](https://react.dev/learn/react-compiler) for more information. 13 | 14 | Note: This will impact Vite dev & build performances. 15 | 16 | ## Expanding the ESLint configuration 17 | 18 | If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules: 19 | 20 | ```js 21 | export default defineConfig([ 22 | globalIgnores(['dist']), 23 | { 24 | files: ['**/*.{ts,tsx}'], 25 | extends: [ 26 | // Other configs... 27 | 28 | // Remove tseslint.configs.recommended and replace with this 29 | tseslint.configs.recommendedTypeChecked, 30 | // Alternatively, use this for stricter rules 31 | tseslint.configs.strictTypeChecked, 32 | // Optionally, add this for stylistic rules 33 | tseslint.configs.stylisticTypeChecked, 34 | 35 | // Other configs... 36 | ], 37 | languageOptions: { 38 | parserOptions: { 39 | project: ['./tsconfig.node.json', './tsconfig.app.json'], 40 | tsconfigRootDir: import.meta.dirname, 41 | }, 42 | // other options... 
43 | }, 44 | }, 45 | ]) 46 | ``` 47 | 48 | You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules: 49 | 50 | ```js 51 | // eslint.config.js 52 | import reactX from 'eslint-plugin-react-x' 53 | import reactDom from 'eslint-plugin-react-dom' 54 | 55 | export default defineConfig([ 56 | globalIgnores(['dist']), 57 | { 58 | files: ['**/*.{ts,tsx}'], 59 | extends: [ 60 | // Other configs... 61 | // Enable lint rules for React 62 | reactX.configs['recommended-typescript'], 63 | // Enable lint rules for React DOM 64 | reactDom.configs.recommended, 65 | ], 66 | languageOptions: { 67 | parserOptions: { 68 | project: ['./tsconfig.node.json', './tsconfig.app.json'], 69 | tsconfigRootDir: import.meta.dirname, 70 | }, 71 | // other options... 72 | }, 73 | }, 74 | ]) 75 | ``` 76 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version: 7 | description: "Version to (re)publish on Docker Hub (i.e. 
1.2.3)" 8 | required: false 9 | push: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | release: 15 | runs-on: ubuntu-latest 16 | 17 | permissions: 18 | contents: write 19 | packages: write 20 | 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 26 | 27 | - name: Setup Node 28 | uses: actions/setup-node@v4 29 | with: 30 | node-version: 20 31 | 32 | - name: Install Semantic Release 33 | run: | 34 | npm install -g semantic-release \ 35 | @semantic-release/changelog \ 36 | @semantic-release/exec \ 37 | @semantic-release/commit-analyzer \ 38 | @semantic-release/release-notes-generator \ 39 | @semantic-release/git 40 | 41 | - name: Configure Git user for bot 42 | run: | 43 | git config user.name "github-actions[bot]" 44 | git config user.email "github-actions[bot]@users.noreply.github.com" 45 | 46 | - name: Run Semantic Release 47 | if: ${{ github.event.inputs.version == '' }} 48 | id: semantic 49 | env: 50 | GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} 51 | run: npx semantic-release 52 | 53 | - name: Set manual version 54 | if: ${{ github.event.inputs.version != '' }} 55 | run: | 56 | echo "RELEASE_VERSION=${{ github.event.inputs.version }}" >> $GITHUB_ENV 57 | echo "⚙️ Forcing rebuild for version: ${{ github.event.inputs.version }}" 58 | 59 | - name: Read release version 60 | if: ${{ github.event.inputs.version == '' }} 61 | run: | 62 | if [ -f VERSION ]; then 63 | VERSION=$(cat VERSION) 64 | echo "RELEASE_VERSION=$VERSION" >> $GITHUB_ENV 65 | echo "Detected release version: $VERSION" 66 | else 67 | echo "No new release detected" 68 | fi 69 | 70 | - name: Set up QEMU 71 | uses: docker/setup-qemu-action@v3 72 | 73 | - name: Set up Buildx 74 | uses: docker/setup-buildx-action@v3 75 | 76 | - name: Login to Docker Hub 77 | uses: docker/login-action@v3 78 | with: 79 | username: ${{ secrets.DOCKERHUB_USERNAME }} 80 | password: ${{ secrets.DOCKERHUB_TOKEN }} 81 | 82 | - name: Build and Push multi-arch image 83 | if: env.RELEASE_VERSION != 
'' 84 | run: | 85 | IMAGE_NAME=${{ secrets.DOCKERHUB_USERNAME }}/docker-surgeon 86 | VERSION=${RELEASE_VERSION} 87 | echo "🚀 Building image: $IMAGE_NAME:$VERSION" 88 | 89 | docker buildx build \ 90 | --platform linux/amd64,linux/arm64 \ 91 | -t $IMAGE_NAME:$VERSION \ 92 | -t $IMAGE_NAME:latest \ 93 | --push . 94 | -------------------------------------------------------------------------------- /app/dashboard/src/components/datepickerform/datepickerform.tsx: -------------------------------------------------------------------------------- 1 | import { useState } from "react"; 2 | import { DatePicker } from "../datepicker/datepicker"; 3 | import { 4 | getChartStats, 5 | getCrashedContainersMetrics, 6 | } from "../../api/crashedContainers"; 7 | import type { 8 | CrashedContainerChartStats, 9 | CrashedContainerLogs, 10 | } from "../../models/crashedContainer"; 11 | import { formatLocalDate } from "../../utils/utils"; 12 | 13 | interface DatePickerFormProps { 14 | setContainerLogs: (containerLogs: CrashedContainerLogs[]) => void; 15 | setChartStats: (stats: CrashedContainerChartStats[]) => void; 16 | setLoading: (loading: boolean) => void; 17 | initialStartDate: Date; 18 | initialEndDate: Date; 19 | } 20 | 21 | export function DatePickerForm({ 22 | setContainerLogs, 23 | setChartStats, 24 | setLoading, 25 | initialStartDate, 26 | initialEndDate, 27 | }: DatePickerFormProps) { 28 | const [startDate, setStartDate] = useState(initialStartDate); 29 | const [endDate, setEndDate] = useState(initialEndDate); 30 | const [error, setError] = useState(null); 31 | 32 | const apiCalls = async () => { 33 | try { 34 | setLoading(true); 35 | const [containerLogsRes, statsRes] = await Promise.all([ 36 | getCrashedContainersMetrics( 37 | formatLocalDate(startDate!), 38 | formatLocalDate(endDate!) 39 | ), 40 | getChartStats( 41 | formatLocalDate(startDate!), 42 | formatLocalDate(endDate!) 
43 | ), 44 | ]); 45 | 46 | setContainerLogs(containerLogsRes); 47 | setChartStats(statsRes); 48 | } catch (err) { 49 | console.error(err); 50 | } finally { 51 | setLoading(false); 52 | } 53 | }; 54 | 55 | async function handleSubmit(e: React.FormEvent) { 56 | e.preventDefault(); 57 | setError(null); 58 | 59 | if(!startDate || !endDate){ 60 | setError("Please select both dates"); 61 | return; 62 | } 63 | const start = new Date(startDate); 64 | const end = new Date(endDate); 65 | start.setHours(0,0,0,0); 66 | end.setHours(0,0,0,0); 67 | 68 | if (start > end) { 69 | setError("Start date cannot be later than end date"); 70 | return; 71 | } 72 | 73 | await apiCalls(); 74 | } 75 | 76 | return ( 77 |
78 |
79 | 84 | 85 | 91 |
92 | {error && ( 93 |
94 |

{error}

95 |
96 | )} 97 |
98 | ); 99 | } 100 | -------------------------------------------------------------------------------- /app/backend/core/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from threading import Lock 3 | from dotenv import load_dotenv 4 | from os import getenv 5 | from app.backend.utils.string_utils import normalize_escapes 6 | 7 | class Config: 8 | _instance = None 9 | _lock = Lock() 10 | restart_policy:any 11 | log_level:str 12 | timezone:str 13 | enable_dashboard:bool 14 | logs_amount:int 15 | dashboard_address:str 16 | dashboard_port:int 17 | admin_password:str | None 18 | enable_notifications:bool 19 | notification_urls:list[str] 20 | notification_title:str | None 21 | notification_body:str | None 22 | 23 | def __init__(self, restart_policy:any, log_level:str, timezone:str, enable_dashboard:bool, logs_amount:int, dashboard_address:str, dashboard_port:int, admin_password: str | None, enable_notifications:bool, notification_urls:list[str], notification_title:str | None, notification_body:str | None): 24 | self.restart_policy = restart_policy 25 | self.log_level = log_level 26 | self.timezone = timezone 27 | self.enable_dashboard = enable_dashboard 28 | self.logs_amount = logs_amount 29 | self.dashboard_address = dashboard_address 30 | self.dashboard_port = dashboard_port 31 | self.admin_password = admin_password 32 | self.enable_notifications = enable_notifications 33 | self.notification_urls = notification_urls 34 | self.notification_title = notification_title 35 | self.notification_body = notification_body 36 | 37 | @classmethod 38 | def load(cls): 39 | try: 40 | load_dotenv() 41 | restart_policy = getenv("RESTART_POLICY", "") 42 | notification_urls = getenv("NOTIFICATION_URLS", "") 43 | 44 | try: 45 | notification_urls = json.loads(notification_urls) if notification_urls else [] 46 | except: 47 | notification_urls = [] 48 | 49 | return cls( 50 | restart_policy = json.loads(restart_policy), 51 | 
log_level = getenv("LOG_LEVEL", "INFO").upper(), 52 | timezone = getenv("LOG_TIMEZONE", "UTC"), 53 | enable_dashboard = getenv("ENABLE_DASHBOARD", "false").strip().lower() == "true", 54 | logs_amount = int(getenv("LOGS_AMOUNT", "10")), 55 | dashboard_address = getenv("DASHBOARD_ADDRESS", "0.0.0.0"), 56 | dashboard_port = int(getenv("DASHBOARD_PORT", "8000")), 57 | admin_password = getenv("ADMIN_PASSWORD", None), 58 | enable_notifications = getenv("ENABLE_NOTIFICATIONS", "false").strip().lower() == "true", 59 | notification_urls= notification_urls, 60 | notification_title = getenv("NOTIFICATION_TITLE", None), 61 | notification_body = normalize_escapes(getenv("NOTIFICATION_BODY", None)) 62 | ) 63 | except Exception as e: 64 | raise Exception(f"Unable to load the config: {e}") 65 | 66 | @classmethod 67 | def get(cls): 68 | try: 69 | if cls._instance is None: 70 | with cls._lock: 71 | if cls._instance is None: 72 | cls._instance = cls.load() 73 | 74 | return cls._instance 75 | except Exception as e: 76 | raise Exception(e) 77 | 78 | 79 | def __repr__(self): 80 | return f"Config:\nRestart Policy: {self.restart_policy}\nLog Level: {self.log_level}\nTime Zone: {self.timezone}\nEnable Dashboard: {self.enable_dashboard}\nDashboard Address: {self.dashboard_address}\nDashboard Port: {self.dashboard_port}\nLogs Amount: {self.logs_amount}\nEnable Notifications: {self.enable_notifications}" 81 | -------------------------------------------------------------------------------- /app/backend/repositories/crashed_container_repository.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from logging import Logger 3 | from sqlalchemy import and_, func 4 | from app.backend.core.database import SessionLocal 5 | from app.backend.schemas.crashed_container_schema import CrashedContainerBase, CrashedContainerLogs 6 | from app.backend.models.crashed_container import CrashedContainer 7 | from 
class CrashedContainerRepository:
    """Queries and inserts for crashed-container records.

    Every method opens its own ``SessionLocal`` session and closes it on exit.
    """

    @staticmethod
    def add_crashed_container(ct_crashed: CrashedContainerBase, logger: Logger):
        """Insert one crash record stamped with the current time.

        Args:
            ct_crashed: Source of the logs, container id and container name.
            logger: Logger used to report the insertion.

        Returns:
            The persisted ``CrashedContainer`` row, refreshed after commit.
        """
        with SessionLocal() as db:
            crashed_container = CrashedContainer(
                logs=ct_crashed.logs,
                crashedon=datetime.now(),
                container_id=ct_crashed.container_id,
                container_name=ct_crashed.container_name
            )

            db.add(crashed_container)
            db.commit()
            # Reload the row so its attributes are populated before the
            # session is closed.
            db.refresh(crashed_container)

            logger.info(f"Container {ct_crashed.container_id} added to the crashed containers table")

            return crashed_container

    @staticmethod
    def get_all_crashed_containers(date_from: datetime, date_to: datetime) -> list[CrashedContainerLogs]:
        """Return the crash records whose date lies in ``[date_from, date_to]``.

        Only the date part of both bounds is used (inclusive on both ends);
        results are ordered by crash time, oldest first.
        """
        with SessionLocal() as db:
            crash_day = func.date(CrashedContainer.crashedon)
            date_from_str = date_from.strftime("%Y-%m-%d")
            date_to_str = date_to.strftime("%Y-%m-%d")

            rows = (
                db.query(
                    CrashedContainer.container_id,
                    CrashedContainer.container_name,
                    CrashedContainer.logs,
                    CrashedContainer.crashedon
                )
                .filter(crash_day >= date_from_str, crash_day <= date_to_str)
                .order_by(CrashedContainer.crashedon.asc())
                .all()
            )

            return [
                CrashedContainerLogs(
                    container_id=container_id,
                    container_name=container_name,
                    crashed_on=crashed_on,
                    logs=logs
                )
                for container_id, container_name, logs, crashed_on in rows
            ]

    @staticmethod
    def get_crashed_containers_stats_by_date(date_from: datetime, date_to: datetime):
        """Return crash counts per (day, container name) within the date range.

        Only the date part of both bounds is used (inclusive on both ends);
        results are ordered by day, then by container name.
        """
        with SessionLocal() as db:
            crash_day = func.date(CrashedContainer.crashedon)

            date_from_str = date_from.strftime("%Y-%m-%d")
            date_to_str = date_to.strftime("%Y-%m-%d")

            rows = (
                db.query(
                    # Rows are grouped by day and name only, so the id has to
                    # be aggregated: a bare column here is picked arbitrarily
                    # by SQLite and rejected outright by stricter databases.
                    func.max(CrashedContainer.container_id).label('container_id'),
                    CrashedContainer.container_name,
                    func.count(CrashedContainer.container_id).label('crash_count'),
                    crash_day.label("crash_date")
                )
                .filter(
                    crash_day >= date_from_str,
                    crash_day <= date_to_str
                )
                .group_by(
                    crash_day,
                    CrashedContainer.container_name
                )
                .order_by(
                    crash_day.asc(),
                    CrashedContainer.container_name.asc()
                )
                .all()
            )

            return [
                ChartStats(
                    crashed_on=row_day,
                    container_id=container_id,
                    container_name=container_name,
                    crash_count=crash_count
                )
                for container_id, container_name, crash_count, row_day in rows
            ]
formatLocalDate(today) 36 | ), 37 | getChartStats( 38 | formatLocalDate(weekEarlier), 39 | formatLocalDate(today) 40 | ), 41 | ]); 42 | 43 | setContainerLogs(containerLogsRes); 44 | setChartStats(statsRes); 45 | } catch (err) { 46 | console.error(err); 47 | } finally { 48 | setLoading(false); 49 | } 50 | }, []); 51 | 52 | useEffect(() => { 53 | setLoading(true); 54 | apiCalls(); 55 | }, [apiCalls]); 56 | 57 | const buildLogsSection = () => { 58 | if (!Array.isArray(containerLogs)) return []; 59 | 60 | const uniqueCrashedContainers = Object.values( 61 | containerLogs.reduce( 62 | ( 63 | acc: Record, 64 | item: CrashedContainerLogs 65 | ) => { 66 | if (!acc[item.container_name]) { 67 | acc[item.container_name] = { ...item }; 68 | } else { 69 | acc[item.container_name].logs += `\n\n${item.logs}`; 70 | } 71 | 72 | return acc; 73 | }, 74 | {} as Record 75 | ) 76 | ); 77 | 78 | return uniqueCrashedContainers; 79 | }; 80 | 81 | const uniqueCrashedContainers = buildLogsSection(); 82 | const selected = uniqueCrashedContainers.find( 83 | (x: CrashedContainerLogs) => x.container_id === selectedContainer 84 | ); 85 | 86 | return ( 87 |
88 | 89 | 90 |
91 | 98 |
99 | 100 |
101 |

Summary

102 | 103 |
104 | 105 |
106 |
107 |

Crash History

108 |
109 | 110 |
111 |
116 | {loading ? ( 117 | 118 | ) : ( 119 | uniqueCrashedContainers.map((cont: CrashedContainerLogs) => { 120 | return ( 121 | 128 | ); 129 | }) 130 | )} 131 |
132 |
133 | 134 |
139 | {loading && } 140 | {!loading && selectedContainer && selected &&

{selected.logs}

} 141 | {!loading && 142 | (!selectedContainer || !selected) && 143 | containerLogs.length > 0 &&

Select a container to view logs...

} 144 |
145 |
146 |
147 | ); 148 | } 149 | -------------------------------------------------------------------------------- /app/dashboard/src/components/datepicker/datepicker.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useRef, useState } from "react"; 2 | 3 | interface DatePickerProps { 4 | value: Date | null; 5 | onChange: (date: Date | null) => void; 6 | label: string; 7 | } 8 | 9 | export function DatePicker({ value, onChange, label }: DatePickerProps) { 10 | const [open, setOpen] = useState(false); 11 | const calendarRef = useRef(null); 12 | 13 | const [month, setMonth] = useState(value!.getMonth()); 14 | const [year, setYear] = useState(value!.getFullYear()); 15 | 16 | useEffect(() => { 17 | if (!open) return; 18 | 19 | function handleClickOutside(e: MouseEvent) { 20 | if ( 21 | calendarRef.current && 22 | !calendarRef.current.contains(e.target as Node) 23 | ) { 24 | setOpen(false); 25 | } 26 | } 27 | 28 | document.addEventListener("mousedown", handleClickOutside); 29 | 30 | return () => { 31 | document.removeEventListener("mousedown", handleClickOutside); 32 | }; 33 | }, [open]); 34 | 35 | function getDaysInMonth(year: number, month: number) { 36 | return new Date(year, month + 1, 0).getDate(); 37 | } 38 | 39 | function getWeekday(year: number, month: number) { 40 | return new Date(year, month, 1).getDay(); // 0 = Sunday 41 | } 42 | 43 | function setToday() { 44 | const today = new Date(); 45 | onChange(today); 46 | setMonth(today.getMonth()); 47 | setYear(today.getFullYear()); 48 | } 49 | 50 | const daysInMonth = getDaysInMonth(year, month); 51 | const firstWeekday = getWeekday(year, month); 52 | 53 | function handleSelect(day: number) { 54 | const newDate = new Date(year, month, day); 55 | onChange(newDate); 56 | setOpen(false); 57 | } 58 | 59 | function prevMonth() { 60 | if (month === 0) { 61 | setMonth(11); 62 | setYear((y) => y - 1); 63 | } else setMonth((m) => m - 1); 64 | } 65 | 66 | function nextMonth() 
{ 67 | if (month === 11) { 68 | setMonth(0); 69 | setYear((y) => y + 1); 70 | } else setMonth((m) => m + 1); 71 | } 72 | 73 | return ( 74 |
75 |
76 | 79 | 85 |
86 | 87 | {open && ( 88 |
92 | {/* Header */} 93 |
94 | 100 | 101 | {new Date(year, month).toLocaleString("default", { 102 | month: "long", 103 | })}{" "} 104 | {year} 105 | 106 | 112 |
113 | 114 | {/* Grid giorni della settimana */} 115 |
116 | Su 117 | Mo 118 | Tu 119 | We 120 | Th 121 | Fr 122 | Sa 123 |
124 | 125 | {/* Giorni */} 126 |
127 | {/* Spazi vuoti prima del primo giorno */} 128 | {Array.from({ length: firstWeekday }).map((_, i) => ( 129 | 130 | ))} 131 | 132 | {/* Giorni effettivi */} 133 | {Array.from({ length: daysInMonth }).map((_, i) => { 134 | const day = i + 1; 135 | const isSelected = 136 | value && 137 | value.getDate() === day && 138 | value.getMonth() === month && 139 | value.getFullYear() === year; 140 | 141 | const today = new Date(); 142 | 143 | const isToday = 144 | today.getDate() === day && 145 | today.getMonth() === month && 146 | today.getFullYear() === year; 147 | 148 | return ( 149 | 162 | ); 163 | })} 164 |
165 |
166 | 174 |
175 |
176 | )} 177 |
178 | ); 179 | } 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Docker Surgeon 2 | A Python service that monitors Docker containers in real time and automatically restarts them based on customizable rules, including any dependent containers. 3 | Ideal for environments where high availability matters and zombie containers are not welcome at the party. 4 | 5 | ## ✨ Key Features 6 | - Monitors Docker events in real-time. 7 | 8 | - Automatically restarts containers that are unhealthy or have unexpectedly exited. 9 | 10 | - Supports a restart policy configurable via environment variables. 11 | 12 | - Handles container dependencies using labels (com.monitor.depends.on). 13 | 14 | - Detailed, timezone-aware logging. 15 | 16 | - Supports container exclusion from restart policies. 17 | 18 | - Supports real-time [notifications](#-notifications) through [Apprise](https://github.com/caronc/apprise) 19 | 20 | ## 🧭 How It Works 21 | 22 | The service listens to Docker daemon events. 23 | When it detects that a container is in an unhealthy state or has exited with a non-excluded code, it restarts it. 24 | If the container has dependencies (defined through labels), it restarts those too, in the correct order, using topological sorting. 25 | 26 | Example: `[db] --> [backend] --> [frontend]`
27 | If `db` goes down, the service will restart `db`, then `backend`, and finally `frontend`. 28 | 29 | 30 | ## 🧪 Environment Variables 31 | Configuration is handled through a `.env` file in the project root. 32 | Here’s an example: 33 | 34 | ``` 35 | # Restart policy in JSON format 36 | RESTART_POLICY = '{ 37 | "excludedContainers": ["container_name"], #-> More than 1 container could be excluded. Specify them as ["container1", "container2"] 38 | "statuses": { 39 | "exited": { 40 | "codesToExclude": [0] #-> More than 1 exit code could be excluded. Specify them as ["code1", "code2", "code3"] 41 | } 42 | } 43 | }' 44 | 45 | ENABLE_DASHBOARD=True #-> Possible values [True | False] 46 | LOGS_AMOUNT=10 #-> This will display the last n logs on the dashboard to clearly indicate the issue that triggered the restart policy 47 | DASHBOARD_ADDRESS=0.0.0.0 #-> Possible values [0.0.0.0 | 127.0.0.1] 48 | DASHBOARD_PORT=8000 #-> Possible values [ Any free port ] 49 | ADMIN_PASSWORD= 50 | ENABLE_NOTIFICATIONS=True #-> Possible values [True | False] 51 | NOTIFICATION_URLS='["url1", "url2"]' #-> Check https://github.com/caronc/apprise/wiki#notification-services 52 | NOTIFICATION_TITLE="" #-> Edit the notification title as you wish 53 | NOTIFICATION_BODY="" #-> Edit the notification body as you wish 54 | 55 | 56 | 57 | ############### 58 | # LOGGING # 59 | ############### 60 | 61 | # --- Log Level --- 62 | # Set the verbosity of logs. Options: "error", "warn", "info", "debug" 63 | # Default: info 64 | LOG_LEVEL= info 65 | 66 | # --- Log Timezone --- 67 | # Adjust the timezone used for logging 68 | # e.g. Europe/Rome, America/New_York 69 | LOG_TIMEZONE=UTC 70 | 71 | ``` 72 | 73 | ### RESTART_POLICY 74 | 75 | Defines which containers to ignore and which states should trigger a restart. 76 | 77 | - `excludedContainers`: list of containers that should never be restarted. 78 | - `statuses`: 79 | - `exited` → restart if the container exited with a non-excluded code. 
80 | - `codesToExclude`: -> A list of codes that should *not* trigger a restart. Check codes [here](https://komodor.com/learn/exit-codes-in-containers-and-kubernetes-the-complete-guide/#:~:text=%EE%80%80Exit%EE%80%81%20%EE%80%80codes%EE%80%81%20are%20used) 81 | 82 | 83 | ### LOG_LEVEL 84 | 85 | Controls log verbosity.
86 | Supported values: `error`, `warn`, `info`, `debug`.
87 | Default: `info`. 88 | 89 | ### LOG_TIMEZONE 90 | 91 | Sets the timezone used in logs.
92 | Must be a valid pytz timezone.
93 | Examples: `UTC`, `Europe/Rome`, `America/New_York`.
94 | Default: `UTC` 95 | 96 | Check the valid timezones [here](https://gist.github.com/heyalexej/8bf688fd67d7199be4a1682b3eec7568) 97 | 98 | ### ENABLE_DASHBOARD 99 | Enables or disables the web dashboard.
100 | Default: `False` 101 | 102 | ### LOGS_AMOUNT 103 | Number of log entries to retain when a container is restarted. 104 | 105 | Default: `10` 106 | 107 | ### DASHBOARD_ADDRESS 108 | Address interface for the dashboard: 109 | - `127.0.0.1` -> Local only 110 | - `0.0.0.0` -> accessible on LAN 111 | 112 | Default: `0.0.0.0` 113 | 114 | ### DASHBOARD_PORT 115 | Port on which the dashboard is served.
116 | Default: `8000` 117 | 118 | ### ADMIN_PASSWORD 119 | Password for accessing the dashboard. 120 | Three formats are supported: 121 | - **Plain text** 122 | - ADMIN_PASSWORD=r4nd0mP4ssW0rD 123 | - [**Bcrypt**](https://bcrypt-generator.com/) 124 | - ADMIN_PASSWORD=$2a$12$9s8F... 125 | - [**Argon2**](https://argon2.online/) 126 | - ADMIN_PASSWORD=$argon2id$v=19$m=65536,t=3,p=4$... 127 | 128 | The system automatically detects whether the value is plain text, bcrypt, or Argon2.
129 | If you want a strong random password (plain text), you can generate one using: `openssl rand -hex 32` *This is a plain password, not an encrypted hash* 130 | 131 | ### ENABLE_NOTIFICATIONS 132 | Enables or disables real-time notifications.
133 | Supported values: `True` | `False`
134 | Default: `False`
135 | See [the Notifications section](#-notifications) for more details. 136 | 137 | ### NOTIFICATION_URLS 138 | A JSON-formatted list of notification endpoints, as documented in the [Apprise URL specification](https://github.com/caronc/apprise/wiki)
139 | Expected Syntax: `'["url1", "url2"]'`
140 | ⚠️ *This must be valid JSON — use double quotes inside the list*. 141 | 142 | ### NOTIFICATION_TITLE 143 | The title template for notifications.
144 | Supports placeholders and emoji.
145 | Default: `'⚠️ {container_name} crashed'` 146 | 147 | Supported placeholders: 148 | - {container_name} 149 | - {logs} 150 | - {exit_code} 151 | - {n_logs} 152 | 153 | ### NOTIFICATION_BODY 154 | The body template for notifications.
155 | Supports placeholders, multiline text (\n), and Markdown formatting.
156 | Does **not** support icons/emoji (depending on the provider).
157 | Default: ```'`exit code`: `{exit_code}`\nLast {n_logs} logs of `{container_name}`: {logs}'``` 158 | 159 | Supported placeholders: 160 | - {container_name} 161 | - {logs} 162 | - {exit_code} 163 | - {n_logs} 164 | 165 | 166 | ## 🔐 Authentication Flow 167 | 1. User submits their password to /auth/login 168 | 2. The server validates it in this order: 169 | - argon2 verification 170 | - bcrypt `checkpw` 171 | - direct comparison (plain text) 172 | 3. If valid, a JWT token is created and stored in an **HttpOnly Cookie** 173 | 4. Protected routes require this cookie to be present and valid 174 | 175 | ## 🔗 Managing Container Dependencies 176 | 177 | You can define container dependencies using the label `com.monitor.depends.on`.
178 | When a parent container is restarted, its dependent containers will be restarted too, in the correct order. 179 | 180 | Example `docker-compose.yml`: 181 | 182 | ``` 183 | services: 184 | db: 185 | image: postgres 186 | container_name: db 187 | 188 | backend: 189 | image: my-backend 190 | container_name: backend 191 | labels: 192 | - "com.monitor.depends.on=db" 193 | 194 | frontend: 195 | image: my-frontend 196 | container_name: frontend 197 | labels: 198 | - "com.monitor.depends.on=backend" 199 | 200 | docker-surgeon: 201 | image: docker-surgeon-image 202 | container_name: docker-surgeon 203 | volumes: 204 | - /var/run/docker.sock:/var/run/docker.sock 205 | env_file: 206 | - path/to/.env 207 | ``` 208 | 209 | In this setup:
210 | If `db` crashes → `db`, `backend`, and `frontend` will be restarted in order.
211 | If `backend` crashes → `backend` and `frontend` will be restarted.
212 | If `frontend` crashes → only `frontend` will be restarted. 213 | 214 | Multiple dependencies can be specified for a container by separating them with a comma: `com.monitor.depends.on=backend,frontend,db` 215 | 216 | ## 🚀 Quick Start 217 | ``` 218 | docker run -d \ 219 | --name docker-surgeon \ 220 | -v /var/run/docker.sock:/var/run/docker.sock \ 221 | -v /your/path/data:/app/app/data \ # persistent data (recommended if dashboard is enabled) 222 | -v $(pwd)/.env:/app/.env \ 223 | krystall0/docker-surgeon:latest 224 | ``` 225 | 226 | You can also override environment variables directly: 227 | ``` 228 | docker run -d \ 229 | --name docker-surgeon \ 230 | -v /var/run/docker.sock:/var/run/docker.sock \ 231 | -v /your/path/data:/app/app/data \ # persistent data (recommended if dashboard is enabled) 232 | -e LOG_LEVEL=INFO \ 233 | -e LOG_TIMEZONE=Europe/Rome \ 234 | -e RESTART_POLICY='{"excludedContainers":["pihole"],"statuses":{"exited":{"codesToExclude":[0]}}}' \ 235 | krystall0/docker-surgeon:latest 236 | ``` 237 | 238 | ### Example `docker-compose.yml` 239 | ``` 240 | version: "3.8" 241 | 242 | services: 243 | docker-surgeon: 244 | image: krystall0/docker-surgeon:latest 245 | container_name: docker-surgeon 246 | restart: always 247 | volumes: 248 | - /var/run/docker.sock:/var/run/docker.sock 249 | - /your/path/data:/app/app/data # persistent data (recommended if dashboard is enabled) 250 | env_file: 251 | - /path/to/.env 252 | 253 | db: 254 | image: postgres 255 | container_name: db 256 | 257 | backend: 258 | image: my-backend 259 | container_name: backend 260 | labels: 261 | - "com.monitor.depends.on=db" 262 | 263 | frontend: 264 | image: my-frontend 265 | container_name: frontend 266 | labels: 267 | - "com.monitor.depends.on=backend" 268 | ``` 269 | 270 | ## 📊 Dashboard Overview 271 | Docker Surgeon includes a built-in web dashboard that helps you inspect: 272 | - Recent container crashes 273 | - Logs grouped by container 274 | - Crash statistics over time 275 | 
- Interactive charts 276 | - Date-based filtering 277 | - Full log viewer with multiline formatting 278 | 279 | To access the dashboard:
280 | ``` 281 | http://<DASHBOARD_ADDRESS>:<DASHBOARD_PORT> 282 | ``` 283 | (Requires authentication — see [**Authentication Flow**](#-authentication-flow)) 284 | 285 | ### Dashboard Preview 286 | ![Dashboard preview](docs/images/preview.png) 287 | 288 | ## 🔔 Notifications 289 | 290 | Docker Surgeon can send real-time notifications whenever a container crashes. 291 | Notifications are handled through Apprise, supporting 70+ services including: 292 | - Discord 293 | - Telegram 294 | - Slack 295 | - Matrix 296 | - Email 297 | - Webhooks 298 | - Gotify / Pushover / Pushbullet 299 | 300 | And many others… 301 | 302 | See [Apprise](https://github.com/caronc/apprise) for more details 303 | 304 | ### Enabling Notifications 305 | Add these variables to your `.env`:
306 | ``` 307 | ENABLE_NOTIFICATIONS=True 308 | NOTIFICATION_URLS='["discord://<webhook_id>/<webhook_token>"]' 309 | NOTIFICATION_TITLE="⚠️ {container_name} crashed" 310 | NOTIFICATION_BODY="`exit code`: `{exit_code}`\nLast {n_logs} logs:\n{logs}" 311 | ``` 312 | 313 | ### Formatting Notifications 314 | Docker Surgeon supports placeholder variables inside `NOTIFICATION_TITLE` and `NOTIFICATION_BODY`.
315 | Available placeholders: 316 | - `{container_name}` → name of the crashed container 317 | - `{exit_code}` → container exit code 318 | - `{logs}` → last N logs (ANSI colors removed) 319 | - `{n_logs}` → number of logs configured in `LOGS_AMOUNT` 320 | 321 | Example notification body:
322 | `exit code`: `{exit_code}`
323 | Container `{container_name}` crashed.
324 | Last {n_logs} logs:
325 | {logs} 326 | 327 | 328 | ### ⚠️ Security Notes 329 | - Do **not** expose the dashboard over the internet without HTTPS and reverse proxy protections 330 | - Always use a strong admin password (preferably hashed) -------------------------------------------------------------------------------- /app/backend/services/monitor_service.py: -------------------------------------------------------------------------------- 1 | from docker import DockerClient, from_env 2 | from app.backend.core.config import Config 3 | from logging import Logger 4 | from datetime import datetime 5 | from time import sleep 6 | from app.backend.schemas.crashed_container_schema import CrashedContainerBase 7 | from app.backend.repositories.crashed_container_repository import CrashedContainerRepository 8 | from app.backend.notifications.notification_manager import NotificationManager 9 | 10 | 11 | ############ 12 | #  CONSTS  # 13 | ############ 14 | LABEL_NAME:str = "com.monitor.depends.on" 15 | 16 | 17 | def monitor_containers(config:Config, logger:Logger): 18 | 19 | client = from_env() 20 | if client is None: 21 | print("[ERROR] - Failed to connect to Docker daemon.") 22 | return 23 | 24 | _watch_container_events(client, config.restart_policy, config.logs_amount, logger) 25 | 26 | def _watch_container_events(client: DockerClient, restart_policy:any, logs_amount:int, logger:Logger): 27 | already_processed = set() 28 | in_progress = set() 29 | 30 | for event in client.events(decode=True): 31 | try: 32 | if event.get("Type") != "container": 33 | continue 34 | 35 | container_id = None 36 | 37 | # --- FIX: Safely retrieve container ID --- 38 | # 1. Try to get the standard top-level ID 39 | if 'id' in event: 40 | container_id = event['id'] 41 | 42 | # 2. 
If the standard ID is missing, check the Actor ID (common for exec/healthcheck events) 43 | elif 'Actor' in event and 'ID' in event['Actor']: 44 | container_id = event['Actor']['ID'] 45 | # ---------------------------------------- 46 | 47 | if container_id is None: 48 | logger.debug(f"Skipping container event with no ID: {event.get('Action')}") 49 | continue 50 | 51 | container_id = container_id[:12] # Truncate to short ID 52 | container_object = client.containers.get(container_id) 53 | 54 | if container_object is None: 55 | logger.warning(f"Container with ID {container_id} not found.") 56 | continue 57 | 58 | if _canBeRestarted(client, container_object, restart_policy, logger): 59 | containerStatusAndExitCode = _getContainerStatusAndExitCode(client, container_object) 60 | container_health_status = containerStatusAndExitCode["healthStatus"] 61 | container_exit_code = containerStatusAndExitCode["exitCode"] 62 | 63 | logger.info(f"Container: {container_object.name} | ID: {container_id} | Status: {container_health_status} | Exit Code: {container_exit_code}. The container will be restarted including all its dependent containers") 64 | crashed_container = CrashedContainerBase(container_id=container_id, container_name=container_object.name, logs=container_object.logs(tail=logs_amount)) 65 | CrashedContainerRepository.add_crashed_container(crashed_container, logger) 66 | NotificationManager.container_crashed_event(container_name=container_object.name, container_logs=container_object.logs(tail=logs_amount), container_exit_code=container_exit_code) 67 | _restart_with_graph(client, container_object, already_processed, in_progress, restart_policy, logger) 68 | 69 | except Exception as e: 70 | # Added logging of the full event for easier debugging of new issues 71 | logger.error(f"Exception occurred: {e}. 
Event data: {event}") 72 | 73 | 74 | def _build_dependency_graph(client: DockerClient) -> dict: 75 | graph = {} 76 | all_containers = client.containers.list(all=True) 77 | 78 | for container in all_containers: 79 | depends_on = container.labels.get(LABEL_NAME) 80 | if depends_on: 81 | parents = [name.strip() for name in depends_on.split(",")] 82 | for parent in parents: 83 | if parent not in graph: 84 | graph[parent] = [] 85 | graph[parent].append(container.name) 86 | 87 | return graph 88 | 89 | def _topological_sort(graph: dict) -> list: 90 | already_visited = set() 91 | stack = [] 92 | 93 | def visit(node): 94 | if node in already_visited: 95 | return 96 | already_visited.add(node) 97 | for child in graph.get(node, []): 98 | visit(child) 99 | stack.append(node) 100 | 101 | for node in graph: 102 | visit(node) 103 | 104 | return stack[::-1] # Invert the results 105 | 106 | 107 | def _restart_with_graph(client: DockerClient, unhealthy_container, already_processed: set, in_progress: set, restart_policy:any, logger: Logger): 108 | graph = _build_dependency_graph(client) 109 | sorted_container_names = _topological_sort(graph) 110 | 111 | logger.debug(f"Graph: {graph}") 112 | logger.debug(f"Sorted container names: {sorted_container_names}") 113 | 114 | 115 | to_restart = [unhealthy_container.name] 116 | relevant = set(to_restart) 117 | 118 | for container_name in sorted_container_names: 119 | # Check if the current container's parent(s) are in the list of containers to be restarted 120 | parents = [parent for parent, children in graph.items() if container_name in children] 121 | 122 | # Check if the container actually exists before trying to get it 123 | try: 124 | ct = client.containers.get(container_name) 125 | except Exception: 126 | logger.debug(f"Dependent container {container_name} not found, skipping.") 127 | continue 128 | 129 | 130 | if any(parent in relevant for parent in parents) and _canBeRestarted(client, ct, restart_policy, logger, True): 131 | 
to_restart.append(container_name) 132 | relevant.add(container_name) 133 | 134 | logger.debug(f"Containers to restart: {to_restart}") 135 | 136 | parents = set(graph.keys()) 137 | restarted_parents = set() 138 | pending_children = set() 139 | 140 | for container_name in to_restart: 141 | if container_name in in_progress or container_name in already_processed: 142 | continue 143 | in_progress.add(container_name) 144 | try: 145 | container = client.containers.get(container_name) 146 | logger.info(f"Restarting container {container.name} ({container.id[:12]})") 147 | 148 | if container.name in parents: 149 | timeout = 60 150 | container.restart() 151 | logger.debug(f"Dependent containers found for {container.name}. Waiting until it is either 'running' or 'healthy'") 152 | logger.debug(f"Waiting {timeout} seconds before aborting the restart operation") 153 | operationInitTime = datetime.now() 154 | while True: 155 | if _parentSuccessfullyRestarted(client, container, logger): 156 | logger.debug(f"{container.name} restarted successfully. 
Proceeding to restart his children") 157 | restarted_parents.add(container.name) 158 | break 159 | if _operationTimedOut(operationInitTime, datetime.now(), timeout, logger): 160 | logger.warning(f"{container.name} did not recover in time - skipping dependent containers") 161 | break 162 | sleep(2) 163 | else: 164 | parents_of_child = [p for p, children in graph.items() if container.name in children] 165 | 166 | if not parents_of_child: 167 | logger.info(f"Container {container.name} has no parent dependencies - restarting independently.") 168 | container.restart() 169 | 170 | else: 171 | parents_of_child.sort(key=lambda p: len(graph.get(p, [])), reverse=True) 172 | unready_parents = [p for p in parents_of_child if p in to_restart and p not in restarted_parents] 173 | 174 | if unready_parents: 175 | logger.warning(f"Skipping {container.name}: waiting for parent(s) {unready_parents}.") 176 | pending_children.add(container.name) 177 | continue 178 | 179 | logger.info(f"Restarting child {container.name} ({container.id[:12]}) - parent(s) ready") 180 | container.restart() 181 | 182 | logger.info(f"Successfully restarted {container.name}") 183 | already_processed.add(container_name) 184 | except Exception as e: 185 | logger.error(f"Failed to restart {container_name}: {e}") 186 | finally: 187 | in_progress.discard(container_name) 188 | 189 | if pending_children: 190 | _retry_pending_children(client, pending_children, graph, restarted_parents, logger) 191 | 192 | 193 | in_progress.clear() 194 | already_processed.clear() 195 | 196 | 197 | def _getContainerStatusAndExitCode(client, container): 198 | 199 | # Reload container object to get the latest state 200 | container.reload() 201 | 202 | container_health_status = container.attrs.get("State", {}).get("Health", {}).get("Status", "unknown") 203 | container_exit_code = container.attrs.get("State", {}).get("ExitCode") 204 | container_real_status = container.attrs.get("State", {}).get("Status", "unknown") 205 | 206 | return 
{"healthStatus": container_health_status, "exitCode": container_exit_code, "realStatus": container_real_status} 207 | 208 | def _canBeRestarted(client, container, restart_policy:any, logger: Logger, checkOnChildren:bool = False) -> bool: 209 | 210 | if container.name in restart_policy.get("excludedContainers", []): 211 | logger.debug(f"{container.name} won't be restarted due to the restart policy") 212 | return False 213 | 214 | containerStatusAndExitCode = _getContainerStatusAndExitCode(client, container) 215 | container_status = containerStatusAndExitCode["healthStatus"] 216 | container_exit_code = containerStatusAndExitCode["exitCode"] 217 | container_real_status = containerStatusAndExitCode["realStatus"] 218 | 219 | logger.debug(f"CONTAINER NAME: {container.name} | STATUS: {container_status} | CODE: {container_exit_code} | REALSTATUS: {container_real_status}") 220 | 221 | # Use health status, then fall back to real status if health is unknown/missing from policy 222 | policy = restart_policy["statuses"].get(container_status, {}) or restart_policy["statuses"].get(container_real_status, {}) 223 | 224 | isContainerUnhealthy: bool = container_real_status == "running" and container_status == "unhealthy" 225 | 226 | if not policy and not isContainerUnhealthy: 227 | if checkOnChildren: 228 | return True # Allow children to be restarted if parent is being restarted 229 | logger.debug(f"No policy found. 
Container {container.name} won't be restarted") 230 | return False 231 | 232 | excluded_exit_codes = policy.get("codesToExclude", []) 233 | 234 | logger.debug(f"POLICY EXCLUDED EXIT CODES: {excluded_exit_codes}") 235 | 236 | # Container has to pass any of these checks to be restarted: 237 | # 1) Container is unhealthy 238 | # 2) Container matches the user-defined state and the exit code is not defined as excluded 239 | # 3) The parent of this container has to be restarted (checkOnChildren is True) 240 | 241 | if isContainerUnhealthy or container_exit_code not in excluded_exit_codes or checkOnChildren: 242 | logger.debug(f"{container.name} will be restarted soon") 243 | return True 244 | 245 | return False 246 | 247 | def _parentSuccessfullyRestarted(client, container, logger:Logger) -> bool: 248 | """ 249 | Checks if the parent container has successfully restarted 250 | """ 251 | 252 | containerStatuses = _getContainerStatusAndExitCode(client, container) 253 | # A container Health status is unknown when there's not healthcheck for it 254 | logger.debug(f"{container.name} statuses: {containerStatuses}") 255 | if (containerStatuses["healthStatus"] == "unknown" or containerStatuses["healthStatus"] == "healthy") and containerStatuses["realStatus"] == "running": 256 | return True 257 | 258 | return False 259 | 260 | def _operationTimedOut(initialDate:datetime, endDate:datetime, timeout:int, logger:Logger) -> bool: 261 | """ 262 | Checks if the operation time has exceeded the given timeout 263 | """ 264 | 265 | deltaSeconds = (endDate-initialDate).seconds 266 | if deltaSeconds >= timeout: 267 | logger.debug(f"Aborting operation due to timeout exceed. Time passed: {deltaSeconds}. 
Max allowed: {timeout}") 268 | return True 269 | 270 | return False 271 | 272 | def _retry_pending_children(client: DockerClient, pending_children: list, graph: dict, restarted_parents: set, logger: Logger, max_retries: int = 1, delay: int = 10): 273 | """ 274 | Tries to restart pending children 275 | """ 276 | logger.info(f"Retrying {len(pending_children)} skipped child container(s).") 277 | 278 | for attempt in range(1, max_retries + 1): 279 | if not pending_children: 280 | break 281 | 282 | logger.debug(f"Retry attempt {attempt}/{max_retries}") 283 | still_pending = [] 284 | 285 | for child_name in pending_children: 286 | try: 287 | parents_of_child = [p for p, children in graph.items() if child_name in children] 288 | unready_parents = [p for p in parents_of_child if p not in restarted_parents] 289 | 290 | if unready_parents: 291 | logger.debug(f"{child_name} still waiting for parent(s): {unready_parents}") 292 | still_pending.append(child_name) 293 | continue 294 | 295 | container = client.containers.get(child_name) 296 | logger.info(f"Retrying restart for {child_name} — parent(s) ready.") 297 | container.restart() 298 | logger.info(f"Successfully restarted {child_name}") 299 | 300 | except Exception as e: 301 | logger.error(f"Retry failed for {child_name}: {e}") 302 | still_pending.append(child_name) 303 | 304 | pending_children = still_pending 305 | if pending_children and attempt < max_retries: 306 | logger.debug(f"{len(pending_children)} child container(s) still pending. Waiting {delay}s before next retry.") 307 | sleep(delay) 308 | else: 309 | logger.info("All pending child containers have been restarted successfully.") 310 | 311 | if pending_children: 312 | logger.warning(f"Some child containers could not be restarted after {max_retries} retries: {pending_children}") --------------------------------------------------------------------------------