├── .eleventy.js
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── TEMPLATE.md
├── generate-sitemap.js
├── glossary_terms - sorted_glossary_terms.csv
├── hero.png
├── package-lock.json
├── package.json
└── src
├── _includes
├── header.hbs
├── layouts
│ ├── base.hbs
│ └── glossary-item.hbs
└── subscribe.hbs
├── acknowledge-time.md
├── acknowledge.md
├── actionable-alert.md
├── adaptive-response-systems.md
├── affected-service.md
├── after-action-review.md
├── ai-assisted-incident-response.md
├── ai-driven-root-cause-analysis.md
├── ai-incident-prediction.md
├── ai-triage.md
├── aiops.md
├── alert-aggregation.md
├── alert-correlation.md
├── alert-deduplication.md
├── alert-enrichment.md
├── alert-escalation.md
├── alert-fatigue.md
├── alert-filtering.md
├── alert-grouping.md
├── alert-management-dashboard.md
├── alert-noise.md
├── alert-prioritization.md
├── alert-routing.md
├── alert-suppression.md
├── alert-threshold.md
├── alert-volume.md
├── alert.md
├── algorithmic-alert-correlation.md
├── algorithmic-incident-classification.md
├── anamoly-based-detection.md
├── andon-cord.md
├── anomaly-detection.md
├── anomaly.md
├── anticipatory-incident-management.md
├── asset-management.md
├── asset.md
├── assets
├── css
│ ├── style.css
│ └── style.css.map
├── images
│ └── spike-logo.svg
├── js
│ ├── glossary.js
│ └── main.js
└── scss
│ ├── base
│ ├── _reset.scss
│ ├── _responsive.scss
│ ├── _typography.scss
│ └── _variables.scss
│ ├── components
│ ├── _alphabet-filter.scss
│ ├── _buttons.scss
│ ├── _glossary-item.scss
│ ├── _home.scss
│ └── _mobile.scss
│ ├── layout
│ ├── _footer.scss
│ └── _grid.scss
│ └── main.scss
├── assigned-incident.md
├── asynchronous-communication.md
├── attack-surface.md
├── attack-vector.md
├── audit-log.md
├── audit-trail.md
├── audit.md
├── automated-escalation.md
├── automated-incident-creation.md
├── automated-incident-routing.md
├── automated-notification.md
├── automated-remediation.md
├── automated-response.md
├── automated-severity-assignment.md
├── automated-status-updates.md
├── automated-triage-workflow.md
├── automated-triage.md
├── automation.md
├── autonomous-incident-resolution.md
├── autonomous-remediation.md
├── backup-responder.md
├── backup.md
├── baseline.md
├── behavioral-analytics.md
├── bi-directional-integration.md
├── blackout-period.md
├── blameless-culture.md
├── blameless-postmortem.md
├── blockchain-incident-monitoring.md
├── bot-assisted-triage.md
├── bottleneck.md
├── breach.md
├── break-fix.md
├── bridge-automation.md
├── bridge-call.md
├── broadcast-notifications.md
├── bug.md
├── bulk-alert-management.md
├── burnout-prevention-algorithms.md
├── burnout.md
├── business-continuity-plan-bcp.md
├── business-continuity.md
├── business-impact-analysis-bia.md
├── business-impact-dashboard.md
├── business-service-intelligence.md
├── business-service-mapping.md
├── business-service.md
├── categorization.md
├── centralized-dashboard.md
├── chain-of-command.md
├── change-management.md
├── chaos-engineering.md
├── cloud-native-incident-management.md
├── cognitive-incident-analysis.md
├── collaborative-incident-response.md
├── collaborative-resolution.md
├── command-and-control.md
├── command-center.md
├── command-post.md
├── compliance.md
├── computer-security-incident-reponse-team-csirt.md
├── configurable-workflows.md
├── configuration-item-ci.md
├── containerized recovery.md
├── containment.md
├── context-enrichment.md
├── contextual-intelligence.md
├── continuous-monitoring.md
├── continuous-resilience.md
├── correlation-rules.md
├── correlation.md
├── crisis-management.md
├── critical-incident.md
├── critical-service.md
├── cross-platform-automation.md
├── cross-team-coordination.md
├── custom-incident-fields.md
├── customer-experience-monitoring.md
├── customer-impact.md
├── customer-notification-system.md
├── cyber-physical-systems-incidents.md
├── dashboard-customization.md
├── dashboard.md
├── data-breach.md
├── data-driven-incident-response.md
├── data-loss-prevention-dlp.md
├── data
└── site.js
├── decentralized-monitoring-systems.md
├── deduplication-rules.md
├── deduplication.md
├── dependency-mapping.md
├── detection-time-mttd.md
├── diagnosis.md
├── disaster-recovery-dr.md
├── disaster-recovery-plan-drp.md
├── distributed-incident-management.md
├── downtime.md
├── dynamic-alert-routing.md
├── dynamic-escalation-policies.md
├── dynamic-incident-prediction.md
├── dynamic-thresholds.md
├── edge-computing-incident-management.md
├── elastic-incident-response-teams.md
├── emergency-change-advisory-board-ecab.md
├── emergency-change.md
├── emergency-committee.md
├── emergency-plan.md
├── enhanced-monitoring-with-ai-ml.md
├── enterprise-aiops-solutions.md
├── enterprise-architect.md
├── enterprise-architecture-ea.md
├── enterprise-policies-and-regulations.md
├── error-budget.md
├── escalate.md
├── escalation-delay.md
├── escalation-matrix.md
├── escalation-policy.md
├── escalation-workflow.md
├── event-categorization-scheme.md
├── event-correlation.md
├── event-deduplication.md
├── event-driven-automation.md
├── event-enrichment.md
├── event-filtering.md
├── event-management.md
├── event-monitoring.md
├── event-record.md
├── event-review.md
├── event-routing.md
├── event-suppression.md
├── event-trends-and-patterns.md
├── event.md
├── external-status-page.md
├── failure-mode-and-effects-analysis-fmea.md
├── failure-point.md
├── false-alarm.md
├── fault-injection-testing-chaos-engineering.md
├── fault-isolation-dashboard.md
├── fault-prediction-with-ai-ml.md
├── fault-tolerance.md
├── fault-tree-analysis.md
├── federated-incident-management-systems.md
├── feedback-loop.md
├── first-line-support.md
├── first-responder-assignment.md
├── first-responder.md
├── fix.md
├── fixed-asset.md
├── flexible-escalation-policy.md
├── flexible-workflows-for-distributed-teams.md
├── follow-the-sun-schedule.md
├── follow-up-notification.md
├── gamification-of-incident-training.md
├── gap-analysis.md
├── generative-ai-for-incident-response.md
├── geo-aware-incident-management.md
├── geo-distributed-alert-routing.md
├── global-incident-intelligence-sharing.md
├── global-incident-response-team.md
├── global-status-dashboard.md
├── glossary.hbs
├── gold-silver-bronze-command-structure.md
├── graph-based-dependency-mapping.md
├── ground-support-unit.md
├── group-notifications.md
├── guided-remediation.md
├── guided-response.md
├── handover.md
├── hazard-identification.md
├── hazard-mitigation.md
├── health-check.md
├── health-monitoring-dashboards.md
├── high-availability.md
├── high-priority-incident.md
├── high-severity-alert-routing.md
├── historical-data-analysis.md
├── historical-incident-reports.md
├── hotfix.md
├── human-error.md
├── human-in-the-loop-ai-for-incident-response.md
├── hybrid-cloud-incident-management.md
├── hybrid-incident-escalation.md
├── hyperautomation-in-incident-management.md
├── immediate-resolution.md
├── impact-analysis-tools-for-incidents.md
├── incident-categorization.md
├── incident-closure.md
├── incident-command-system-ics.md
├── incident-commander.md
├── incident-detection.md
├── incident-escalation.md
├── incident-identification.md
├── incident-lifecycle.md
├── incident-logging.md
├── incident-management.md
├── incident-manager.md
├── incident-model.md
├── incident-monitoring.md
├── incident-prediction-with-ai-ml.md
├── incident-prioritization.md
├── incident-record.md
├── incident-report.md
├── incident-resolution.md
├── incident-response.md
├── incident-status-information.md
├── incident-summary.md
├── incident.md
├── index.hbs
├── initial-response.md
├── instant-notifications.md
├── integrated-aiops-for-proactive-responses.md
├── integrated-status-pages.md
├── integration-ecosystem.md
├── intelligent-alert-routing.md
├── intelligent-automation-in-incident-management.md
├── interactive-postmortems.md
├── internal-status-page.md
├── jeopardy-management.md
├── joint-ai-human-response-teams.md
├── joint-command.md
├── joint-incident-view.md
├── joint-information-center-jic.md
├── journey-mapping-for-incident-response.md
├── judgement-call.md
├── jump-host-access.md
├── just-in-time-alert-routing.md
├── just-in-time-knowledge-base.md
├── key-performance-indicators-kpis.md
├── key-risk-indicators.md
├── key-stakeholder-notifications.md
├── knowledge-automation-in-incident-resolution.md
├── knowledge-base.md
├── knowledge-centered-postmortems.md
├── knowledge-graphs-for-incident-correlation.md
├── knowledge-management.md
├── known-error-database-kedb.md
├── known-error.md
├── latency-alerts.md
├── latency.md
├── learning-algorithms-for-root-cause-analysis.md
├── level-1-support-l1.md
├── level-2-support-l2.md
├── level-3-support-l3.md
├── live-incident-updates.md
├── log-analysis.md
├── log-based-anomaly-detection.md
├── log-monitoring.md
├── logging.md
├── low-code-incident-automation.md
├── machine-learning-for-incident-prediction.md
├── machine-learning-for-root-cause-analysis.md
├── maintenance-mode.md
├── major-incident.md
├── manual-escalation.md
├── mean-time-between-failures-mtbf.md
├── mean-time-to-acknowledge-mtta.md
├── mean-time-to-detect-mttd.md
├── mean-time-to-diagnose-mttd.md
├── mean-time-to-recovery-mttr.md
├── mean-time-to-resolve-mttr.md
├── metrics-dashboard.md
├── microservices-monitoring.md
├── mobile-alerts.md
├── mobile-first-incident-response.md
├── monitoring-system.md
├── monitoring.md
├── monkey-patching.md
├── multi-channel-notifications.md
├── multi-cloud-incident-management.md
├── multi-factor-authentication.md
├── mutual-aid-agreement.md
├── national-incident-management-system-nims.md
├── natural-language-processing-for-incident-analysis.md
├── network-dependency-mapping.md
├── network-latency.md
├── network-monitoring.md
├── network-operations-center-noc.md
├── network-outage.md
├── network-resilience-automation.md
├── neural-network-monitoring.md
├── noise-reduction.md
├── non-compliance.md
├── non-conformance.md
├── non-critical-incident.md
├── normal-operations.md
├── notification-escalation.md
├── notification-protocol.md
├── notification-routing.md
├── notification-templates.md
├── notification.md
├── observability-driven-incident-response.md
├── observability-integration.md
├── observability.md
├── on-call-shift.md
├── oncall-calendar.md
├── oncall-engineer.md
├── oncall-load-distribution.md
├── oncall-load.md
├── oncall-management.md
├── oncall-override.md
├── oncall-responder.md
├── oncall-rotation.md
├── oncall-schedule.md
├── oncall.md
├── open-telemetry.md
├── operational-analytics.md
├── operational-dashboard.md
├── operational-intelligence.md
├── operational-maturity-om.md
├── operational-readiness.md
├── operational-resilience.md
├── operations-bridge.md
├── operations-lead.md
├── operations.md
├── outage-tracking.md
├── outage.md
├── outcome-based-incident-management.md
├── p0-priority-zero.md
├── p1-priority-one.md
├── p2-priority-two.md
├── p3-priority-three.md
├── p4-priority-four.md
├── phone-call-notifications.md
├── platform-engineering.md
├── platform-integration.md
├── playbook.md
├── post-incident-review-pir.md
├── postmortem-templates.md
├── postmortem.md
├── predictable-pricing.md
├── predictive-analytics.md
├── preventive-action.md
├── preventive-intelligence.md
├── primary-responder.md
├── priority-automation.md
├── priority-detection.md
├── priority-matrix.md
├── priority.md
├── proactive-alerts.md
├── proactive-incident-response.md
├── proactive-monitoring.md
├── proactive-response.md
├── problem-management.md
├── problem-record.md
├── process-automation.md
├── production-environment.md
├── quality-assurance.md
├── quality-control.md
├── quality-management-system-qms.md
├── quantitative-analysis.md
├── quantitative-incident-analytics.md
├── quantitative-risk-assessment-qra.md
├── quantum-computing-security-incidents.md
├── quantum-resistant-encryption.md
├── query-builder.md
├── queue-management.md
├── queue-prioritization.md
├── queue.md
├── quick-actions.md
├── quick-reponse.md
├── real-time-alerts.md
├── real-time-collaboration-tools.md
├── recovery-plan.md
├── recovery-point-objective-rpo.md
├── recovery-time-objective-rto.md
├── recovery.md
├── release-management.md
├── remote-incident-response.md
├── resilience-engineering.md
├── resilience.md
├── resolution-time.md
├── resolution-tracking.md
├── resolve.md
├── response-automation.md
├── response-time.md
├── risk-analysis.md
├── risk-management.md
├── risk-prediction-with-ai.md
├── risk-register.md
├── robotic-process-automation-rpa.md
├── role-based-access-control.md
├── root-cause-analysis-rca.md
├── root-cause.md
├── runbook.md
├── scheduled-maintenance.md
├── secondary-responder.md
├── security-incident-response.md
├── security-incident.md
├── self-healing-incident.md
├── self-healing-systems.md
├── sentiment-analysis-for-customer-impact.md
├── server-incident-management.md
├── service-degradation.md
├── service-dependency-visualization.md
├── service-desk.md
├── service-impact.md
├── service-level-agreement-sla.md
├── service-level-indicator-sli.md
├── service-level-objective-slo.md
├── service-mapping-dashboard.md
├── service-mapping.md
├── service-mesh-observability.md
├── service-owner.md
├── service-restoration.md
├── service.md
├── severity-automation.md
├── severity.md
├── shadow-on-call-rotation.md
├── single-point-of-failure-spof.md
├── single-point-of-failure.md
├── site-reliability-engineering-sre.md
├── site.manifest
├── sre-as-a-service.md
├── stakeholder.md
├── standard-operating-procedure.md
├── status-page.md
├── support-tier.md
├── suppression-rules.md
├── swarming.md
├── synthetic-monitoring.md
├── system-failure.md
├── system-outage.md
├── teams-multi-management.md
├── technical-debt.md
├── technical-support.md
├── telemetry-based-incident-detection.md
├── template-library.md
├── threat-and-error-management.md
├── threat-intelligence.md
├── threat-management.md
├── threat.md
├── threshold.md
├── ticket-automation.md
├── ticket-management.md
├── ticket.md
├── tier-1-2-3-support.md
├── time-to-acknowledge.md
├── time-to-detect.md
├── time-to-resolution.md
├── time-to-respond.md
├── timeline-view.md
├── total-cost-of-ownership-tco.md
├── triage-automation.md
├── triage.md
├── trigger.md
├── troubleshooting.md
├── unified-aiops.md
├── unified-communications.md
├── unified-monitoring.md
├── unified-observability.md
├── unplanned-downtime.md
├── unplanned-maintenance.md
├── unresolved-incident.md
├── uptime-percentage.md
├── uptime-sla.md
├── uptime.md
├── urgency-classification.md
├── user-experience-monitoring.md
├── user-experience.md
├── user-impact.md
├── user-management.md
├── user-permissions.md
├── value-stream-incident-analysis.md
├── vendor-incident.md
├── vendor-management.md
├── version-control.md
├── vip-alert-routing.md
├── virtual-incident-command-center.md
├── virtual-reality-incident-response.md
├── virtual-responder.md
├── virtual-response-team.md
├── virtual-war-room.md
├── visibility-controls.md
├── visibility.md
├── visualization-dashboard.md
├── voice-activated-incident-management.md
├── voice-alert-configuration.md
├── voice-communication.md
├── vulnerability-management.md
├── vulnerability-prediction.md
├── vulnerability.md
├── war-room.md
├── warm-standby.md
├── warning.md
├── waterfall-method.md
├── web3-incident-management.md
├── webhook.md
├── weekly-incident-reports.md
├── weekly-rotation.md
├── well-being-features.md
├── widespread-outage.md
├── work-log.md
├── workflow-automation.md
├── workflow-builder.md
├── workflow-engine.md
├── workflow-intelligence.md
├── workflow-orchestration.md
├── workflow-template.md
├── workload-management.md
├── x-team-cross-functional-team.md
├── xops-cross-operational-practices.md
├── xss-cross-site-scripting.md
├── yearly-incident-review.md
├── yearly-incident-trends.md
├── yearly-maintenance-window.md
├── yoy-year-over-year-incident-analysis.md
├── zero-day-vulnerability.md
├── zero-downtime.md
├── zero-latency-detection.md
├── zero-noise-alerting.md
├── zero-ops.md
├── zero-touch-automation.md
├── zero-trust-architecture-zta.md
├── zero-trust-security.md
├── zombie-server.md
├── zone-based-recovery.md
└── zone-based-routing.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependencies
2 | node_modules/
3 | npm-debug.log
4 | yarn-debug.log
5 | yarn-error.log
6 |
7 | # Environment
8 | .env
9 | .env.local
10 | .env.*.local
11 |
12 | # Build output
13 | dist/
14 | build/
15 | out/
16 | _site/
17 |
18 | pages/
19 |
20 | # IDE and editor files
21 | .idea/
22 | .vscode/
23 | *.swp
24 | *.swo
25 | .DS_Store
26 |
27 | # Logs
28 | logs/
29 | *.log
30 |
31 | # Testing
32 | coverage/
33 |
34 | # Temporary files
35 | tmp/
36 | temp/
37 |
38 | # Misc
39 | .cache/
40 | .DS_Store
41 | Thumbs.db
42 | dump.rdb
43 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2025 FatSync Software Private Limited
2 |
3 | Permission is hereby granted, free of charge, to any person
4 | obtaining a copy of this software and associated documentation
5 | files (the "Software"), to deal in the Software without
6 | restriction, including without limitation the rights to use,
7 | copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the
9 | Software is furnished to do so, subject to the following
10 | conditions:
11 |
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ---
2 | term: [Term Name]
3 | excerpt: [A short, concise description (50-60 words) that summarizes the term.]
4 | featured: false
5 | # featuredHeading: [Optional custom heading to display when featured]
6 | # related:
7 | # - name: [Name of related term]
8 | # slug: [Slug of related term]
9 | ---
10 | ## What Is [Your Term]
11 |
12 | [Start with a comprehensive definition that expands on the excerpt. Explain what the term means in the context of incident management or SRE practices.]
13 |
14 | ## Why Is [Your Term] Important
15 |
16 | [Explain the significance of this term, why it matters, and how it contributes to effective incident management, SRE, or DevOps practices.]
17 |
18 | ## Example Of [Your Term]
19 |
20 | [Provide a practical, real-world example that illustrates how this term is applied in a typical scenario. Use specific details to make it relatable.]
--------------------------------------------------------------------------------
/hero.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spikehq/glossary/bc0b1de432ccab890efa9fb3f7e26dd105e66a72/hero.png
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "glossary",
3 | "version": "1.0.1",
4 | "description": "Spike.sh glossary with Sass support",
5 | "main": "index.js",
6 | "scripts": {
7 | "start": "eleventy --serve",
8 | "sitemap": "node generate-sitemap.js",
9 | "build": "npm run sass && eleventy && npm run sitemap",
10 | "clean": "rm -rf _site",
11 | "sass": "sass src/assets/scss/main.scss:src/assets/css/style.css --style=compressed",
12 | "sass:watch": "sass --watch src/assets/scss/main.scss:src/assets/css/style.css --style=compressed",
13 | "dev": "concurrently \"npm run sass:watch\" \"eleventy --serve --quiet\"",
14 | "preview": "npm run build && eleventy --serve"
15 | },
16 | "keywords": [
17 | "glossary",
18 | "11ty",
19 | "handlebars",
20 | "sass"
21 | ],
22 | "author": "",
23 | "license": "MIT",
24 | "dependencies": {
25 | "@11ty/eleventy": "^3.0.0",
26 | "@11ty/eleventy-plugin-handlebars": "^1.0.0",
27 | "@11ty/eleventy-plugin-syntaxhighlight": "^5.0.0",
28 | "handlebars": "^4.7.8",
29 | "markdown-it": "^14.1.0",
30 | "sass": "^1.87.0"
31 | },
32 | "devDependencies": {
33 | "concurrently": "^9.1.2"
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/acknowledge.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Acknowledge is the act of confirming receipt of an incident alert and taking initial ownership of the response.
3 | term: Acknowledge
4 | ---
5 | ## What Is Acknowledge In Incident Management
6 |
7 | Acknowledge is the act of confirming receipt of an incident alert and taking initial ownership of the response. It's the first step in the incident management process where a responder indicates they are aware of the issue and will begin working on it.
8 |
9 | ## Example Of Acknowledge In Incident Management
10 |
11 | A system monitoring tool detects high CPU usage and sends an alert to the on-call engineer. The engineer receives the notification on their phone, taps the "acknowledge" button, and the incident management system records the time and updates the incident status from "triggered" to "acknowledged."
12 |
13 | ## How To Acknowledge With Spike
14 |
15 | - Spike lets you acknowledge alerts with one click from web, mobile, email, or chat app.
16 | - Acknowledged incidents are instantly visible to teammates, cutting duplicate effort.
17 | - You can assign the alert to yourself or another team member after acknowledging.
18 | - Spike records who acknowledged the incident and when.
19 |
20 | Start managing incidents in [Spike](https://app.spike.sh/signup) and never miss an alert—try acknowledging your first alert today.
--------------------------------------------------------------------------------
/src/affected-service.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Affected Service is any system, application, infrastructure component, or business function that experiences degraded performance or complete failure during an incident.
3 | term: Affected Service
4 | ---
5 | ## What Is Affected Service
6 |
7 | An Affected Service is any system, application, infrastructure component, or business function that experiences degraded performance or complete failure during an incident. It represents the scope of impact that an incident has on an organization's technical environment.
8 |
9 | ## Example Of Affected Service
10 |
11 | During a network outage, the affected services might include the customer login system, payment processing, and inventory management. The support ticketing system remains operational because it runs on a separate network. This information helps the incident team prioritize restoring the payment processing service first due to its direct revenue impact.
--------------------------------------------------------------------------------
/src/after-action-review.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An After-Action Review (AAR) is a structured analysis conducted after an incident to identify what happened, why it happened, and how to improve future responses.
3 | term: After-Action Review
4 | ---
5 | ## What Is After-Action Review
6 |
7 | An After-Action Review (AAR) is a structured analysis conducted after an incident to identify what happened, why it happened, and how to improve future responses. It's a learning process that examines both successes and failures without assigning blame.
8 |
9 | ## Why Is After-Action Review Important
10 |
11 | AARs create a continuous improvement cycle for incident management. They help teams learn from experience, identify systemic issues, and develop more effective response strategies. Regular AARs build institutional knowledge and prevent recurring problems.
12 |
13 | ## Example Of After-Action Review
14 |
15 | Following a major service outage, a team conducts an AAR and discovers that the incident escalated because monitoring alerts weren't properly routed. They implement new alert routing rules and create clearer escalation paths for similar incidents.
16 |
17 | ## How To Implement After-Action Review
18 |
19 | - Schedule the review within 1-2 weeks after incident resolution
20 | - Include all stakeholders who were involved in the incident
21 | - Focus on timeline reconstruction and root cause analysis
22 | - Document findings and action items with clear ownership
23 | - Share learnings with the broader organization
--------------------------------------------------------------------------------
/src/alert-escalation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Alert escalation is the process of moving an unresolved incident to higher-level responders.
3 | term: Alert Escalation
4 | ---
5 | ## What Is Alert Escalation
6 |
7 | Alert escalation is the process of moving an unresolved incident to higher-level responders. This happens when initial responders don’t acknowledge or can’t fix the incident within a set time.
8 |
9 | ## Why Is Alert Escalation Important
10 |
11 | Alert escalation helps incidents get the right attention and resources quickly. It reduces downtime and prevents small issues from becoming major outages.
12 |
13 | ## Example Of Alert Escalation
14 |
15 | A critical server alert is not acknowledged in 10 minutes. The system automatically alerts a senior engineer.
16 |
17 | ## How To Implement Alert Escalation With Spike
18 |
19 | - Go to the dashboard and click "new escalation" to create a policy.
20 | - Give your policy a descriptive name based on team members involved.
21 | - Add multiple escalation levels with different notification channels (phone, Slack, email).
22 | - Set wait times between levels to control when alerts move to the next person.
23 | - Enable repeat escalations for critical alerts that can't be missed.
24 |
25 | Never miss another critical incident—set up your first escalation policy with [Spike](https://app.spike.sh/signup) today and keep the right people in the loop.
--------------------------------------------------------------------------------
/src/alert-fatigue.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Alert fatigue is a condition where incident responders become desensitized to notifications due to receiving too many alerts, particularly false positives.
3 | term: Alert Fatigue
4 | ---
5 | ## What Is Alert Fatigue
6 |
7 | Alert fatigue is a condition where incident responders become desensitized to notifications due to receiving too many alerts, particularly false positives. In this state, responders may miss critical alerts, respond more slowly, or make errors in judgment during incident management.
8 |
9 | ## Example Of Alert Fatigue
10 |
11 | A DevOps engineer receives over 200 alerts during their shift, with 90% being false positives. When a critical database failure occurs, the engineer misses the alert because it blends in with the noise, resulting in extended downtime that could have been prevented.
12 |
13 | ## How To Reduce Alert Fatigue With Spike
14 |
15 | - Spike filters non-essential alerts automatically, so you only get notifications that need attention.
16 | - Use Spike's suppression engine to cut down 97% of noise from repeat incidents.
17 | - Set up smart alert routing to direct critical alerts to phone calls and less urgent ones to Slack.
18 | - Enable work modes like Deep Work, Cooldown, and Out of Office to manage notifications during focus time or breaks.
19 | - Customize severity and priority settings to influence which alerts reach you and how.
20 |
21 | Start reducing alert fatigue today and improve your team's response quality with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/alert-grouping.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Alert grouping is the process of combining related alerts into a single notification or case.
3 | term: Alert Grouping
4 | ---
5 | ## What Is Alert Grouping
6 |
7 | Alert grouping is the process of combining related alerts into a single notification or case. This helps responders focus on the root issue without getting overwhelmed by multiple, similar alerts.
8 |
9 | ## Why Is Alert Grouping Important
10 |
11 | Alert grouping cuts down noise and alert fatigue, making it easier for teams to spot and respond to real incidents. It speeds up response times and helps teams avoid missing critical issues hidden in a flood of alerts.
12 |
13 | ## Example Of Alert Grouping
14 |
15 | If a server goes down, several monitors might trigger alerts for CPU, memory, and network. Alert grouping combines these into one incident, so the responder gets a single notification instead of many.
16 |
17 | ## How To Implement Alert Grouping With Spike
18 |
19 | - Spike auto-groups repeated incidents so you get one alert instead of many
20 | - See how often an incident repeats, with first and last occurrence details
21 | - Use this context to spot patterns and focus on real problems
22 |
23 | Cut down alert noise and get better incident context—try alert grouping in [Spike](https://app.spike.sh/signup) today.
--------------------------------------------------------------------------------
/src/alert-volume.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Alert volume is the total number of alerts generated by monitoring or incident management systems over a specific period.
3 | term: Alert Volume
4 | ---
5 | ## What Is Alert Volume
6 |
7 | Alert volume is the total number of alerts generated by monitoring or incident management systems over a specific period. It helps teams understand their workload and the noise level in their alerting process.
8 |
9 | ## Why Is Tracking Alert Volume Important
10 |
11 | Tracking alert volume helps teams spot patterns, reduce alert fatigue, and focus on real issues. High alert volume can signal too many false alarms or poorly tuned monitoring.
12 |
13 | ## How To Implement Alert Volume Tracking
14 |
15 | - Use monitoring tools that log all alerts
16 | - Set up dashboards to visualize alert counts over time
--------------------------------------------------------------------------------
/src/alert.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Alert is a notification triggered when a monitored system, application, or service exceeds predefined thresholds or exhibits abnormal behavior.
3 | term: Alert
4 | ---
5 | ## What Is Alert
6 |
7 | An Alert is a notification triggered when a monitored system, application, or service exceeds predefined thresholds or exhibits abnormal behavior. It serves as the initial signal that something requires attention in an incident management workflow.
8 |
9 | ## Why Is Alert Important
10 |
11 | Alerts provide early warning of potential issues before they impact users. They enable proactive problem resolution and minimize downtime. Well-designed alerts help teams prioritize their response efforts based on severity and business impact.
12 |
13 | ## Example Of Alert
14 |
15 | A database server's memory utilization crosses the 90% threshold for five consecutive minutes. The monitoring system generates an alert with severity "High" and automatically notifies the database team via Slack and email. The alert includes the server name, current memory usage, and a link to the performance dashboard.
--------------------------------------------------------------------------------
/src/andon-cord.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Andon Cord is a concept from lean manufacturing adapted for incident management, representing a mechanism that allows any team member to halt operations when they detect a critical issue.
3 | term: Andon Cord
4 | ---
5 | ## What Is Andon Cord
6 |
7 | An Andon Cord is a concept from lean manufacturing adapted for incident management, representing a mechanism that allows any team member to halt operations when they detect a critical issue. In incident management, it's a protocol that empowers employees to escalate and trigger an immediate response to prevent further damage.
8 |
9 | ## Why Is Andon Cord Important
10 |
11 | The Andon Cord concept creates a culture where anyone can raise an alarm about critical issues without fear. It prevents small problems from escalating into major incidents by enabling quick intervention. This approach distributes responsibility for system health across the entire team rather than limiting it to managers or specific roles.
12 |
13 | ## Example Of Andon Cord
14 |
15 | A junior developer notices unusual database query patterns that might indicate a security breach. Instead of waiting for confirmation or escalating through normal channels, they "pull the Andon Cord" by triggering a high-priority alert, which immediately notifies the security team and potentially prevents a data breach.
--------------------------------------------------------------------------------
/src/asset.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: In incident management, an asset is any component of an organization's IT infrastructure that needs to be monitored, maintained, and protected.
3 | term: Asset
4 | ---
5 | ## What Is Asset In Incident Management
6 |
7 | In incident management, an asset is any component of an organization's IT infrastructure that needs to be monitored, maintained, and protected. Assets include hardware, software, data, services, and systems that support business operations.
8 |
9 | ## Why Is Asset Important In Incident Management
10 |
11 | Understanding your assets is fundamental to effective incident management. You can't protect what you don't know exists. Clear asset visibility helps teams identify affected components during incidents, understand dependencies, and prioritize response efforts based on business impact.
12 |
13 | ## Example Of Asset In Incident Management
14 |
15 | A payment processing API would be considered a critical asset for an e-commerce company. If this API experiences an outage, the incident response team would prioritize its restoration because of its direct impact on revenue.
--------------------------------------------------------------------------------
/src/assets/js/main.js:
--------------------------------------------------------------------------------
1 | // Mobile alphabet dropdown functionality: navigates to the section chosen in the dropdown
2 | document.addEventListener('DOMContentLoaded', function() { // defer the lookup until the DOM has been parsed
3 | const mobileDropdown = document.querySelector('.mobile-alphabet-dropdown');
4 | if (mobileDropdown) { // nothing to wire up on pages without the dropdown
5 | mobileDropdown.addEventListener('change', function() {
6 | const selectedValue = this.value; // used below both as the URL hash and as a CSS selector
7 | if (selectedValue) { // ignore options whose value is empty
8 | window.location.hash = selectedValue;
9 | const targetElement = document.querySelector(selectedValue); // NOTE(review): querySelector throws on an invalid selector (e.g. an id starting with a digit) — confirm the option values
10 | if (targetElement) { // only scroll when the selector matched an element
11 | targetElement.scrollIntoView({ behavior: 'smooth' });
12 | }
13 | }
14 | });
15 | }
16 | });
--------------------------------------------------------------------------------
/src/assets/scss/base/_reset.scss:
--------------------------------------------------------------------------------
1 | // Reset styles
2 |
3 | * {
4 | margin: 0;
5 | padding: 0;
6 | box-sizing: border-box;
7 | }
8 |
9 | body {
10 | font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
11 | line-height: 1.6;
12 | color: var(--text-color);
13 | background-color: #fff;
14 | font-size: 16px;
15 | }
16 |
17 | a {
18 | color: var(--primary-color);
19 | text-decoration: none;
20 |
21 | // &:hover {
22 | // text-decoration: underline;
23 | // }
24 | }
25 |
26 | .container {
27 | max-width: var(--container-width);
28 | margin: 0 auto;
29 | padding: 0 20px;
30 | }
31 |
--------------------------------------------------------------------------------
/src/assets/scss/base/_responsive.scss:
--------------------------------------------------------------------------------
1 | // Responsive styles
2 | @use 'variables' as vars;
3 |
4 | // Mixins for media queries
5 | @mixin mobile { // wraps the caller's rules in a query for viewports narrower than vars.$breakpoint-md
6 | @media (max-width: #{vars.$breakpoint-md - 1px}) { // upper bound is one pixel below the md breakpoint
7 | @content; // the block passed to @include is emitted here
8 | }
9 | }
10 |
11 | @mixin tablet { // wraps the caller's rules in a query from vars.$breakpoint-md up to one pixel below vars.$breakpoint-lg
12 | @media (min-width: #{vars.$breakpoint-md}) and (max-width: #{vars.$breakpoint-lg - 1px}) {
13 | @content; // the block passed to @include is emitted here
14 | }
15 | }
16 |
17 | @mixin desktop { // wraps the caller's rules in a query for viewports at least vars.$breakpoint-lg wide
18 | @media (min-width: #{vars.$breakpoint-lg}) {
19 | @content; // the block passed to @include is emitted here
20 | }
21 | }
22 |
23 | // Mobile-specific styles
24 | @include mobile {
25 | .letter-items,
26 | .related-cards {
27 | grid-template-columns: 1fr;
28 | }
29 |
30 | .header-container {
31 | flex-direction: column;
32 | gap: 15px;
33 | }
34 |
35 | .main-nav ul {
36 | gap: 15px;
37 | }
38 |
39 | .glossary-content h1 {
40 | font-size: 28px;
41 | }
42 |
43 | .glossary-sections h2 {
44 | font-size: 22px;
45 | }
46 | }
47 |
48 | // Tablet-specific styles
49 | @include tablet {
50 | .letter-items,
51 | .related-cards {
52 | grid-template-columns: repeat(2, 1fr);
53 | }
54 | }
--------------------------------------------------------------------------------
/src/assets/scss/base/_typography.scss:
--------------------------------------------------------------------------------
1 | // Typography styles
2 |
3 | h1, h2, h3, h4, h5, h6 {
4 | margin-bottom: 20px;
5 | line-height: 1.3;
6 | }
7 |
8 | p {
9 | margin-bottom: 20px;
10 | }
11 |
--------------------------------------------------------------------------------
/src/assets/scss/base/_variables.scss:
--------------------------------------------------------------------------------
1 | // Variables
2 |
3 | :root {
4 | --primary-color: #1649FF;
5 | --secondary-color: #f5f5f5;
6 | --text-color: #202020;
7 | --text-light: #777;
8 | --border-color: #e5e5e5;
9 | --text-gray: #DEE2E6;
10 | --container-width: 1200px;
11 | --orange: #FFA52F;
12 | --font-family-secondary: "Monoton", sans-serif;
13 | }
14 |
15 | // Sass variables (for use in the Sass files)
16 | $primary-color: #1649FF;
17 | $secondary-color: #f5f5f5;
18 | $text-color: #202020;
19 | $text-light: #777;
20 | $border-color: #e5e5e5;
21 | $container-width: 1200px;
22 |
23 | // Breakpoints
24 | $breakpoint-sm: 576px;
25 | $breakpoint-md: 768px;
26 | $breakpoint-lg: 992px;
27 | $breakpoint-xl: 1200px;
28 |
--------------------------------------------------------------------------------
/src/assets/scss/components/_buttons.scss:
--------------------------------------------------------------------------------
1 | // Button styles
2 | @use 'sass:color';
3 | @use '../base/variables' as vars;
4 |
5 | .button {
6 | display: inline-block;
7 | padding: 0.625rem 1.25rem; /* 10px 20px in rem */
8 | background-color: var(--primary-color);
9 | color: white;
10 | border-radius: 0.25rem; /* 4px in rem */
11 | font-weight: 500;
12 | transition: background-color 0.3s;
13 |
14 | &:hover {
15 | background-color: color.adjust(vars.$primary-color, $lightness: -5%);
16 | text-decoration: none;
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/assets/scss/components/_home.scss:
--------------------------------------------------------------------------------
1 | // Home page styles
2 |
3 | .home-page {
4 | text-align: center;
5 | max-width: 50rem; /* 800px in rem */
6 | margin: 0 auto;
7 |
8 | .featured-terms {
9 | margin: 2.5rem 0; /* 40px in rem */
10 | }
11 |
12 | .browse-all {
13 | margin-top: 2.5rem; /* 40px in rem */
14 | }
15 | }
16 |
17 | #header {
18 | background-color: var(--secondary-color);
19 | }
20 |
--------------------------------------------------------------------------------
/src/assets/scss/components/_mobile.scss:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spikehq/glossary/bc0b1de432ccab890efa9fb3f7e26dd105e66a72/src/assets/scss/components/_mobile.scss
--------------------------------------------------------------------------------
/src/assets/scss/layout/_footer.scss:
--------------------------------------------------------------------------------
1 | // Footer styles
2 |
3 | footer {
4 | background-color: var(--secondary-color);
5 | padding: 30px 0;
6 | text-align: center;
7 | color: var(--text-light);
8 | }
9 |
--------------------------------------------------------------------------------
/src/assets/scss/layout/_grid.scss:
--------------------------------------------------------------------------------
1 | // Grid layouts
2 |
3 | .term-grid {
4 | display: grid;
5 | grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
6 | gap: 20px;
7 | margin-top: 30px;
8 | }
--------------------------------------------------------------------------------
/src/assets/scss/main.scss:
--------------------------------------------------------------------------------
1 | // Import all partials using newer @use syntax
2 |
3 | // Base styles
4 | @use 'base/variables';
5 | @use 'base/reset';
6 | @use 'base/typography';
7 |
8 | // Layout
9 | @use 'layout/footer';
10 | @use 'layout/grid';
11 |
12 | // Components
13 | @use 'components/buttons';
14 | @use 'components/glossary-item';
15 | @use 'components/home';
16 | @use 'components/alphabet-filter';
17 |
18 | // Responsive
19 | @use 'base/responsive';
--------------------------------------------------------------------------------
/src/assigned-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An assigned incident is an issue that has been formally allocated to a specific individual or team for investigation and resolution.
3 | term: Assigned Incident
4 | ---
5 | ## What Is Assigned Incident
6 |
7 | An assigned incident is an issue that has been formally allocated to a specific individual or team for investigation and resolution. It includes clear ownership, expected response times, and accountability for driving the incident to closure.
8 |
9 | ## Example Of Assigned Incident
10 |
11 | A payment processing failure is detected and automatically assigned to the financial systems team based on predefined routing rules. The team lead acknowledges the assignment and begins troubleshooting while keeping stakeholders informed.
--------------------------------------------------------------------------------
/src/attack-surface.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Attack surface in incident management refers to the total sum of points where unauthorized users could potentially access systems or data.
3 | term: Attack Surface
4 | ---
5 | ## What Is Attack Surface
6 |
7 | Attack surface in incident management refers to the total sum of points where unauthorized users could potentially access systems or data. It includes all exposed services, APIs, user interfaces, protocols, and other entry points that could be exploited during a security incident.
8 |
9 | ## How To Implement Attack Surface Management
10 |
11 | - Conduct regular asset inventory to identify all systems and services
12 | - Use scanning tools to discover exposed services and endpoints
13 | - Document all external-facing applications and interfaces
14 | - Implement continuous monitoring of the attack surface
15 | - Regularly review and reduce unnecessary exposure
--------------------------------------------------------------------------------
/src/attack-vector.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An attack vector is a specific path or method that an attacker uses to gain unauthorized access to a system, network, or application during a security incident.
3 | term: Attack Vector
4 | ---
5 | ## What Is Attack Vector
6 |
7 | An attack vector is a specific path or method that an attacker uses to gain unauthorized access to a system, network, or application during a security incident. Common attack vectors include phishing emails, vulnerable software, unsecured APIs, weak passwords, and social engineering techniques.
8 |
9 | ## Why Is Identifying Attack Vector Important
10 |
11 | Identifying attack vectors helps incident response teams understand how breaches occur and develop targeted prevention strategies. It allows for faster incident resolution by pinpointing entry methods, guides remediation efforts, and improves future security measures.
12 |
13 | ## Example Of Attack Vector
14 |
15 | A company experiences a data breach. During incident investigation, the team discovers the attack vector was a phishing email that tricked an employee into revealing their credentials, which attackers then used to access sensitive customer information.
--------------------------------------------------------------------------------
/src/backup-responder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Backup Responder is a designated individual who steps in when the primary on-call responder is unavailable during an incident.
3 | term: Backup Responder
4 | ---
5 | ## Who Is Backup Responder
6 |
7 | A Backup Responder is a designated individual who steps in when the primary on-call responder is unavailable during an incident. They maintain the same level of authority and responsibility to investigate and resolve incidents, ensuring continuous coverage for critical systems.
8 |
9 | ## Why Is Backup Responder Important
10 |
11 | Backup Responders prevent gaps in incident response coverage that could lead to extended outages. They reduce the risk of missed alerts, provide necessary redundancy for 24/7 operations, and help prevent burnout among primary responders by distributing on-call responsibilities.
12 |
13 | ## Example Of Backup Responder
14 |
15 | During a critical database outage at 2 AM, the primary on-call engineer doesn't respond within the 5-minute SLA. The system automatically notifies the backup responder, who acknowledges the alert and begins troubleshooting immediately, minimizing downtime.
--------------------------------------------------------------------------------
/src/breach.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A breach is an incident where unauthorized access to systems, networks, or data occurs, potentially compromising confidentiality, integrity, or availability of information.
3 | term: Breach
4 | ---
5 | ## What Is Breach
6 |
7 | A breach is an incident where unauthorized access to systems, networks, or data occurs, potentially compromising confidentiality, integrity, or availability of information. In incident management, it represents a security failure requiring immediate response to contain damage and prevent further unauthorized access.
8 |
9 | ## Example Of Breach
10 |
11 | A company discovers unusual database query patterns at 2 AM. Investigation reveals an attacker exploited a vulnerability to access customer payment information. The incident team immediately isolates affected systems, blocks the attacker's access, and begins the process of securing systems and notifying affected customers.
--------------------------------------------------------------------------------
/src/bug.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: In incident management, a bug is a flaw or error in software or hardware that causes a system to produce unexpected or incorrect results.
3 | term: Bug
4 | ---
5 | ## What Is Bug
6 |
7 | In incident management, a bug is a flaw or error in software or hardware that causes a system to produce unexpected or incorrect results. Bugs can lead to incidents ranging from minor glitches to major outages.
8 |
9 | ## Example Of Bug
10 |
11 | A software update introduces a memory leak that gradually slows down the system over time. This bug causes periodic system crashes, triggering incidents that require investigation and resolution.
--------------------------------------------------------------------------------
/src/business-service.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A business service is a set of related functions that support core business activities.
3 | term: Business Service
4 | ---
5 | ## What Is Business Service
6 |
7 | A business service is a set of related functions that support core business activities. In incident management, it refers to the IT services that directly enable or support these business functions, such as email systems, customer databases, or e-commerce platforms.
8 |
9 | ## Example Of Business Service
10 |
11 | For an online retailer, the e-commerce platform is a critical business service. Any incident affecting this service, such as website downtime or payment processing issues, would have a direct impact on sales and customer satisfaction.
--------------------------------------------------------------------------------
/src/command-post.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Command Post is a designated physical or virtual location where incident response leaders gather during major incidents to coordinate activities, make decisions, and communicate with stakeholders.
3 | term: Command Post
4 | ---
5 | ## What Is Command Post
6 |
7 | A Command Post is a designated physical or virtual location where incident response leaders gather during major incidents to coordinate activities, make decisions, and communicate with stakeholders. It serves as the central hub for information flow and strategic direction during crisis situations.
8 |
9 | ## Why Is Command Post Important
10 |
11 | A Command Post creates a unified command structure during complex incidents. It improves coordination, speeds up decision-making, and centralizes communication. Without a command post, response efforts often become fragmented, leading to confusion, duplicated efforts, and delayed resolution.
--------------------------------------------------------------------------------
/src/compliance.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Compliance in incident management refers to adhering to regulatory requirements, industry standards, and internal policies when handling and resolving incidents.
3 | term: Compliance
4 | ---
5 | ## What Is Compliance In Incident Management
6 |
7 | Compliance in incident management refers to adhering to regulatory requirements, industry standards, and internal policies when handling and resolving incidents. It involves maintaining proper documentation, following established procedures, and meeting reporting obligations during incident response.
8 |
9 | ## Why Is Compliance Important In Incident Management
10 |
11 | Compliance protects organizations from legal penalties, financial losses, and reputational damage. It creates a structured approach to incident management that helps teams respond consistently and effectively. Compliance also builds trust with customers and partners by demonstrating commitment to security and reliability standards.
12 |
13 | ## Example Of Compliance In Incident Management
14 |
15 | During a data breach incident, a healthcare company follows HIPAA requirements by documenting the extent of the breach, notifying affected patients within the required timeframe, and reporting the incident to regulatory authorities with all required information.
--------------------------------------------------------------------------------
/src/computer-security-incident-reponse-team-csirt.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Computer Security Incident Response Team (CSIRT) is a specialized group responsible for receiving, analyzing, and responding to computer security incidents.
3 | term: Computer Security Incident Response Team (CSIRT)
4 | ---
5 | ## What Is Computer Security Incident Response Team (CSIRT)
6 |
7 | A Computer Security Incident Response Team (CSIRT) is a specialized group responsible for receiving, analyzing, and responding to computer security incidents. This team coordinates the organization's response to security breaches, cyber attacks, and other security-related events to minimize damage and restore normal operations.
8 |
9 | ## Why Is Computer Security Incident Response Team (CSIRT) Important
10 |
11 | CSIRTs provide the expertise and focus needed to handle complex security incidents effectively. They reduce response time, limit damage from security breaches, and help organizations recover faster. Their specialized knowledge helps prevent similar incidents in the future through improved security measures.
12 |
13 | ## How To Build Computer Security Incident Response Team (CSIRT)
14 |
15 | - Recruit team members with diverse security and technical backgrounds
16 | - Define clear procedures for incident detection, analysis, and response
17 | - Establish communication protocols with other departments and external entities
18 | - Provide specialized tools and resources for security incident investigation
19 | - Develop incident classification frameworks and response playbooks
--------------------------------------------------------------------------------
/src/configuration-item-ci.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Configuration Item (CI) is any component that needs to be managed to deliver an IT service.
3 | term: Configuration Item (CI)
4 | ---
5 | ## What Is Configuration Item (CI)
6 |
7 | A Configuration Item (CI) is any component that needs to be managed to deliver an IT service. CIs include hardware, software, documentation, personnel, and facilities that contribute to service delivery and require version control, protection, and tracking throughout their lifecycle.
8 |
9 | ## Why Is Configuration Item (CI) Important
10 |
11 | Configuration Items form the foundation of effective incident management by providing visibility into system components and their relationships. They help teams quickly identify affected components during incidents, understand potential impacts, and implement appropriate fixes without causing unintended consequences.
12 |
13 | ## Example Of Configuration Item (CI)
14 |
15 | During a network outage, the incident response team uses CI information to identify that a specific router (CI) is malfunctioning. The CI record shows all connected systems, recent changes, and the responsible team, allowing for faster diagnosis and targeted resolution.
--------------------------------------------------------------------------------
/src/correlation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Correlation in incident management is the process of identifying relationships between multiple alerts, events, or incidents to determine if they share a common cause or are related in some way.
3 | term: Correlation
4 | ---
5 | ## What Is Correlation
6 |
7 | Correlation in incident management is the process of identifying relationships between multiple alerts, events, or incidents to determine if they share a common cause or are related in some way. It helps teams see the bigger picture rather than treating each alert as an isolated issue.
8 |
9 | ## Why Is Correlation Important
10 |
11 | Correlation reduces alert noise and helps teams focus on root causes rather than symptoms. It prevents duplicate work, speeds up incident resolution, and provides context for troubleshooting. Without correlation, teams often waste time addressing individual alerts while missing the underlying problem.
12 |
13 | ## Example Of Correlation
14 |
15 | A website experiences slow response times, generating alerts from the application server, database, and load balancer. Through correlation, the incident team identifies that all issues started after a recent code deployment, allowing them to quickly roll back the change rather than investigating each alert separately.
--------------------------------------------------------------------------------
/src/critical-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Critical Incident is a high-severity event that significantly impacts business operations, customer experience, or data security.
3 | term: Critical Incident
4 | ---
5 | ## What Is Critical Incident
6 |
7 | A Critical Incident is a high-severity event that significantly impacts business operations, customer experience, or data security. These incidents typically involve service outages, data breaches, or system failures that require immediate attention and often activate emergency response procedures.
8 |
9 | ## Example Of Critical Incident
10 |
11 | An e-commerce platform experiences complete payment system failure during Black Friday sales. Customers cannot complete purchases, revenue is lost by the minute, and social media complaints are escalating. This triggers the highest severity incident response, pulling in executives and multiple technical teams.
--------------------------------------------------------------------------------
/src/critical-service.md:
--------------------------------------------------------------------------------
1 | ## What Is Critical Service
2 |
3 | A critical service is any system or application essential for business operations. If it fails, it causes major disruption or financial loss.
4 |
5 | ## Why Is Critical Service Important
6 |
7 | Identifying critical services helps teams focus monitoring and response efforts where downtime would hurt most. This protects revenue and reputation.
8 |
9 | ## Example Of Critical Service
10 |
11 | A payment gateway for an e-commerce site is a critical service. If it goes down, customers can't buy products.
12 |
13 | ## How To Identify Critical Service
14 |
15 | - List all services and map their dependencies
16 | - Assess the business impact if each service fails
17 | - Tag critical services in monitoring and alerting tools
18 | - Review and update the list regularly
--------------------------------------------------------------------------------
/src/cyber-physical-systems-incidents.md:
--------------------------------------------------------------------------------
1 | ## What Are Cyber Physical Systems Incidents
2 |
3 | Cyber Physical Systems Incidents are events that affect systems where digital and physical components interact. These incidents can impact both digital operations and physical infrastructure, potentially causing real-world consequences.
4 |
5 | ## Example Of Cyber Physical Systems Incidents
6 |
7 | A manufacturing plant experiences a cyberattack that alters the settings on robotic assembly lines. This not only disrupts digital systems but also causes physical damage to products and equipment.
8 |
9 | ## How To Implement Cyber Physical Systems Incident Management
10 |
11 | - Develop incident response plans that address both cyber and physical aspects
12 | - Create cross-functional teams with IT and operational technology expertise
13 | - Implement monitoring systems that cover both digital and physical components
14 | - Establish communication protocols between IT and physical security teams
15 | - Conduct regular simulations of cyber-physical incidents
16 |
17 | ## Best Practices
18 |
19 | - Maintain up-to-date inventories of all cyber-physical system components
20 | - Implement segmentation between IT and operational technology networks
21 | - Regularly assess and update security measures for both cyber and physical aspects
--------------------------------------------------------------------------------
/src/dashboard.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A dashboard is a visual display that presents critical incident management data in real-time.
3 | term: Dashboard
4 | ---
5 | ## What Is Dashboard
6 |
7 | A dashboard is a visual display that presents critical incident management data in real-time. It consolidates key metrics, alerts, and statuses into a single interface, allowing teams to monitor system health, track ongoing incidents, and measure performance at a glance.
8 |
9 | ## Why Is Dashboard Important
10 |
11 | Dashboards provide immediate visibility into your operational status, helping teams detect and respond to incidents faster. They transform complex data into actionable insights, facilitate better decision-making during critical situations, and keep all stakeholders informed with consistent, up-to-date information.
--------------------------------------------------------------------------------
/src/data-breach.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A data breach is an incident where unauthorized parties gain access to sensitive, protected, or confidential information.
3 | term: Data Breach
4 | ---
5 | ## What Is Data Breach
6 |
7 | A data breach is an incident where unauthorized parties gain access to sensitive, protected, or confidential information. In incident management, it represents a security incident requiring immediate response to contain the breach, assess the damage, and prevent further unauthorized access.
8 |
9 | ## Example Of Data Breach
10 |
11 | A company discovers unusual database query patterns at 2 AM. Investigation reveals an attacker exploited a vulnerability to access customer records. The incident team immediately isolates affected systems, blocks the attack vector, and begins assessing which data was compromised.
--------------------------------------------------------------------------------
/src/detection-time-mttd.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Detection Time, often measured as Mean Time to Detect (MTTD), is the average time between when an incident occurs and when it is discovered by the organization.
3 | term: Detection Time (MTTD)
4 | ---
5 | ## What Is Detection Time (MTTD)
6 |
7 | Detection Time, often measured as Mean Time to Detect (MTTD), is the average time between when an incident occurs and when it is discovered by the organization. This metric reflects how quickly your monitoring systems and processes can identify problems that affect your services or infrastructure.
8 |
9 | ## Why Is Detection Time Important
10 |
11 | Fast detection time directly impacts your ability to resolve incidents quickly. The longer an issue goes undetected, the more damage it can cause to your systems and customer experience. Reducing MTTD helps minimize downtime, preserve customer trust, and lower the overall business impact of incidents.
12 |
13 | ## Example Of Detection Time
14 |
15 | A company's website experiences a database slowdown at 2:00 AM. Their monitoring system detects the issue at 2:07 AM, resulting in a detection time of 7 minutes. This quick detection allows the on-call team to address the problem before most customers are affected.
--------------------------------------------------------------------------------
/src/emergency-change-advisory-board-ecab.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Emergency Change Advisory Board (ECAB) is a smaller, more accessible version of the standard Change Advisory Board that convenes quickly to assess and approve emergency changes.
3 | term: Emergency Change Advisory Board (ECAB)
4 | ---
5 | ## What Is Emergency Change Advisory Board (ECAB)
6 |
7 | An Emergency Change Advisory Board (ECAB) is a smaller, more accessible version of the standard Change Advisory Board that convenes quickly to assess and approve emergency changes. The ECAB typically consists of key decision-makers who can be reached 24/7 to evaluate high-priority changes that cannot wait for regular CAB meetings.
8 |
9 | ## Why Is Emergency Change Advisory Board (ECAB) Important
10 |
11 | The ECAB provides governance and oversight for emergency changes while maintaining the agility needed during critical incidents. It balances the need for quick action with appropriate risk assessment, preventing rash decisions while still enabling rapid response to urgent situations.
--------------------------------------------------------------------------------
/src/emergency-committee.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Emergency Committee is a cross-functional team responsible for managing organizational response during major incidents or crises.
3 | term: Emergency Committee
4 | ---
5 | ## What Is Emergency Committee
6 |
7 | An Emergency Committee is a cross-functional team responsible for managing organizational response during major incidents or crises. This committee makes critical decisions, coordinates resources, and oversees communication during emergencies that affect business operations, safety, or reputation.
8 |
9 | ## Why Is Emergency Committee Important
10 |
11 | Emergency Committees provide clear leadership and decision-making authority during crises when normal operations are disrupted. They bring together expertise from across the organization to address complex incidents, coordinate response efforts, and minimize impact on business operations and stakeholders.
12 |
13 | ## How To Build Emergency Committee
14 |
15 | - Appoint members representing IT, security, legal, communications, and business units
16 | - Define clear roles and responsibilities for each committee member
17 | - Establish activation thresholds and notification procedures
18 | - Create decision-making frameworks for common emergency scenarios
19 | - Conduct regular training exercises to prepare the committee
--------------------------------------------------------------------------------
/src/enterprise-architect.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Enterprise Architect is a strategic role responsible for designing and overseeing an organization's IT architecture to align with business goals. In incident management, they provide critical insights into system dependencies, potential impacts, and architectural solutions to prevent recurring incidents.
3 | term: Enterprise Architect
4 | ---
5 | ## What Is Enterprise Architect
6 |
7 | An Enterprise Architect is a strategic role responsible for designing and overseeing an organization's IT architecture to align with business goals. In incident management, they provide critical insights into system dependencies, potential impacts, and architectural solutions to prevent recurring incidents.
8 |
9 | ## Why Is Enterprise Architect Important
10 |
11 | Enterprise Architects bring a holistic view of the organization's technology landscape during incident response and prevention. They help identify root causes that span multiple systems, design more resilient architectures, and ensure that incident-driven changes align with long-term technology strategy rather than creating technical debt.
--------------------------------------------------------------------------------
/src/error-budget.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An error budget is a predefined amount of acceptable system downtime or errors within a specific period.
3 | term: Error Budget
4 | ---
5 | ## What Is Error Budget
6 |
7 | An error budget is a predefined amount of acceptable system downtime or errors within a specific period. It balances the need for system reliability with the pace of innovation.
8 |
9 | ## Why Is Error Budget Important
10 |
11 | Error budgets help teams make informed decisions about when to push new features versus focusing on stability. They create a shared responsibility for reliability between development and operations teams.
12 |
13 | ## Example Of Error Budget
14 |
15 | A company sets a 99.9% uptime goal for their service. This allows for roughly 43 minutes of downtime per month. Teams can use this budget for planned maintenance or new feature deployments.
--------------------------------------------------------------------------------
/src/escalation-delay.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Escalation delay is the time taken between an incident being detected and the moment it is escalated to the next level of response.
3 | term: Escalation Delay
4 | ---
5 | ## What Is Escalation Delay
6 |
7 | Escalation delay is the time taken between an incident being detected and the moment it is escalated to the next level of response. It measures how quickly issues move up the chain when initial responders cannot resolve them.
8 |
9 | ## Why Is Escalation Delay Important
10 |
11 | Escalation delays give enough time for the responder to acknowledge and work on the incident before calling in the next person.
12 |
13 | ## Example Of Escalation Delay
14 |
15 | A critical database alert is not acknowledged within 10 minutes. The system escalates the alert to a senior engineer, who responds immediately.
16 |
17 | ## How To Implement Escalation Delay With Spike
18 |
19 | - Go to the Escalations section in Spike and click on new escalation
20 | - Add team members and set the wait time before Spike escalates to the next level
21 | - Save your policy. Spike will now alert the next person if the incident isn’t acknowledged in time
22 |
23 | Keep incidents moving with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/escalation-matrix.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An escalation matrix is a visual representation of the escalation policy, showing who to contact at each level of escalation for different types of incidents.
3 | term: Escalation Matrix
4 | ---
5 | ## What Is Escalation Matrix
6 |
7 | An escalation matrix is a visual representation of the escalation policy, showing who to contact at each level of escalation for different types of incidents.
8 |
9 | ## Why Is Escalation Matrix Important
10 |
11 | An escalation matrix provides clarity and quick reference during high-stress incidents. It speeds up decision-making and ensures that the right people are involved at the right time.
12 |
13 | ## Example Of Escalation Matrix
14 |
15 | A table showing incident types (network, database, application) on one axis and escalation levels (L1, L2, L3, Management) on the other, with specific team members or roles listed in each cell.
16 |
17 | ## How to Create Escalation Matrix
18 |
19 | - Identify all possible incident types and severity levels
20 | - Define escalation levels and associated roles
21 | - Create a clear, easy-to-read matrix document
22 | - Distribute the matrix to all relevant team members
23 | - Integrate the matrix into your incident management tools
24 |
25 | ## Best Practices
26 |
27 | - Keep the matrix up-to-date with current contact information
28 | - Make the matrix easily accessible during incidents
29 | - Include backup contacts for each role in the matrix
--------------------------------------------------------------------------------
/src/event-correlation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Event Correlation is the process of analyzing relationships between multiple events to identify patterns, causes, and effects.
3 | term: Event Correlation
4 | ---
5 | ## What Is Event Correlation
6 |
7 | Event Correlation is the process of analyzing relationships between multiple events to identify patterns, causes, and effects. It helps teams connect seemingly isolated events into a coherent picture, revealing the underlying issues that may be causing incidents.
8 |
9 | ## Why Is Event Correlation Important
10 |
11 | Event correlation reduces noise by grouping related alerts together. It helps identify the root cause when multiple systems generate events due to a single underlying problem. This speeds up diagnosis, reduces mean time to resolution, and prevents teams from chasing symptoms rather than causes.
12 |
13 | ## Example Of Event Correlation
14 |
15 | A network switch failure triggers dozens of separate alerts from dependent systems. Event correlation tools recognize that all these alerts started within seconds of each other and share a network path. The system automatically creates a single incident ticket focused on the switch rather than the downstream effects.
--------------------------------------------------------------------------------
/src/event-enrichment.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Event enrichment is the process of adding context and relevant information to raw event data.
3 | term: Event Enrichment
4 | ---
5 | ## What Is Event Enrichment
6 |
7 | Event enrichment is the process of adding context and relevant information to raw event data. It helps incident responders quickly understand and prioritize events.
8 |
9 | ## Why Is Event Enrichment Important
10 |
11 | Enriched events provide more actionable information, reducing the time to diagnose and resolve incidents. They help teams focus on critical issues and make informed decisions faster.
12 |
13 | ## Example Of Event Enrichment
14 |
15 | A server error event is enriched with information about the affected service, recent code deployments, and current user impact. This additional context helps the on-call engineer quickly assess the situation.
16 |
17 | ## How to Implement Event Enrichment
18 |
19 | - Identify key data sources for enrichment (e.g., CMDB, monitoring tools)
20 | - Set up integrations to pull relevant data in real-time
21 | - Define rules for automatically adding context to events
22 | - Implement a system to correlate related events
23 | - Continuously refine your enrichment process based on feedback
24 |
25 | ## Best Practices
26 |
27 | - Focus on adding truly valuable information to avoid noise
28 | - Use standardized formats for enriched data to aid in analysis
29 | - Ensure that enrichment doesn't significantly delay event processing
--------------------------------------------------------------------------------
/src/event-filtering.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Event filtering is a process in incident management that selects or excludes specific events based on predefined criteria.
3 | term: Event Filtering
4 | ---
5 | ## What Is Event Filtering
6 |
7 | Event filtering is a process in incident management that selects or excludes specific events based on predefined criteria. It helps reduce noise and focus on relevant information, allowing teams to prioritize and respond to critical incidents more effectively.
8 |
9 | ## Why Is Event Filtering Important
10 |
11 | Event filtering prevents information overload by reducing the volume of non-critical alerts. It helps incident response teams focus on significant events, improves response times, and reduces alert fatigue among team members.
12 |
13 | ## Example Of Event Filtering
14 |
15 | A monitoring system generates alerts for all server CPU usage spikes. Event filtering is applied to only show alerts when CPU usage exceeds 90% for more than 5 minutes, reducing unnecessary notifications.
16 |
17 | ## How To Implement Event Filtering
18 |
19 | - Define clear criteria for critical events
20 | - Configure monitoring tools with appropriate thresholds
21 | - Set up rules to categorize and prioritize events
22 | - Regularly review and adjust filtering rules
23 |
24 | ## Best Practices
25 |
26 | - Align filtering criteria with business priorities
27 | - Use a tiered approach to filter events by severity
28 | - Regularly audit filtered events to ensure critical issues aren't missed
--------------------------------------------------------------------------------
/src/event-record.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An event record is a documented account of a significant occurrence within an IT environment.
3 | term: Event Record
4 | ---
5 | ## What Is Event Record
6 |
7 | An event record is a documented account of a significant occurrence within an IT environment. It typically includes details such as the event type, timestamp, affected systems, and any automatic or manual actions taken in response.
8 |
9 | ## Why Is Event Record Important
10 |
11 | Event records provide a historical trail of system behavior, supporting incident analysis, trend identification, and compliance requirements. They serve as valuable references for troubleshooting and improving incident response processes.
12 |
13 | ## Example Of Event Record
14 |
15 | A server crash event record might include: Event ID: 1001, Timestamp: 2023-04-14 15:30 UTC, Description: Unexpected server shutdown, Affected System: Web Server 03, Initial Action: Automatic restart initiated.
--------------------------------------------------------------------------------
/src/event-review.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Event review is the process of analyzing recorded events to gain insights, identify patterns, and improve incident management processes.
3 | term: Event Review
4 | ---
5 | ## What Is Event Review
6 |
7 | Event review is the process of analyzing recorded events to gain insights, identify patterns, and improve incident management processes. It involves examining event records, discussing significant incidents, and developing action plans for future improvements.
8 |
9 | ## Why Is Event Review Important
10 |
11 | Event reviews help organizations learn from past incidents, refine their response strategies, and prevent recurring issues. They support continuous improvement in incident management and contribute to overall system reliability.
12 |
13 | ## Example Of Event Review
14 |
15 | A monthly event review reveals that 70% of critical incidents were related to a specific application. This insight leads to a focused investigation and subsequent upgrade of the problematic application.
16 |
17 | ## How To Do Event Review
18 |
19 | - Schedule regular event review sessions
20 | - Prepare summaries of significant events for discussion
21 | - Involve key stakeholders from different teams
22 | - Analyze trends and patterns in event data
23 | - Develop action plans based on review findings
24 |
25 | ## Best Practices
26 |
27 | - Focus on learning and improvement rather than blame
28 | - Use data visualization to help identify trends
29 | - Follow up on action items from previous reviews
--------------------------------------------------------------------------------
/src/event.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An event is any observable occurrence in an IT system or business process that may require attention.
3 | term: Event
4 | ---
5 | ## What Is Event
6 |
7 | An event is any observable occurrence in an IT system or business process that may require attention. Events include system alerts, log entries, monitoring notifications, or user reports. Not all events are incidents—they only become incidents when they impact service or violate service level agreements.
8 |
9 | ## Why Is Event Important
10 |
11 | Events provide the raw data needed to detect and diagnose incidents. Proper event monitoring allows teams to identify potential issues before they impact users. Events also create an audit trail that helps with post-incident analysis and pattern recognition.
12 |
13 | ## Example Of Event
14 |
15 | A server generates an event when its CPU utilization exceeds 90% for five minutes. This event triggers an alert to the operations team, who investigate whether this performance issue might affect service delivery or require intervention.
--------------------------------------------------------------------------------
/src/failure-point.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A failure point is a specific component, process, or connection in a system that can malfunction and cause an incident.
3 | term: Failure Point
4 | ---
5 | ## What Is Failure Point
6 |
7 | A failure point is a specific component, process, or connection in a system that can malfunction and cause an incident. In incident management, identifying failure points helps teams understand where problems originate and how they propagate through interconnected systems.
8 |
9 | ## Why Is Failure Point Important
10 |
11 | Understanding failure points helps teams respond more effectively to incidents by targeting the root cause rather than symptoms. It also guides preventive measures to strengthen vulnerable areas. Mapping potential failure points in advance speeds up troubleshooting when incidents occur.
12 |
13 | ## Example Of Failure Point
14 |
15 | During a service outage, an incident response team identifies a load balancer as the failure point. While multiple application servers showed errors, the investigation revealed that the load balancer stopped distributing traffic properly. This insight allowed them to restore service quickly by failing over to a backup load balancer.
--------------------------------------------------------------------------------
/src/false-alarm.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A false alarm in incident management is an alert triggered by something other than a real incident or threat.
3 | term: False Alarm
4 | ---
5 | ## What Is a False Alarm
6 |
7 | A false alarm in incident management is an alert triggered by something other than a real incident or threat. It signals a problem when there is none, often due to system errors or misconfigured monitoring tools.
8 |
9 | ## Example of a False Alarm
10 |
11 | A monitoring system sends an alert about high CPU usage. On investigation, the spike was caused by a scheduled backup, not a real problem.
12 |
13 | ## How to Reduce False Alarms
14 |
15 | - Regularly review and update alert rules to match current systems
16 | - Investigate each false alarm to find the root cause
17 | - Adjust thresholds and filters to reduce unnecessary alerts
18 | - Keep a log of false alarms to spot patterns
19 | - Train team members to recognize and report false alarms
--------------------------------------------------------------------------------
/src/first-responder-assignment.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: First Responder Assignment is the process of designating specific team members to be the initial point of contact when an incident occurs.
3 | term: First Responder Assignment
4 | ---
5 | ## What Is First Responder Assignment
6 |
7 | First Responder Assignment is the process of designating specific team members to be the initial point of contact when an incident occurs. These individuals are responsible for acknowledging alerts, performing initial triage, and either resolving the issue or escalating it appropriately.
8 |
9 | ## Why Is First Responder Assignment Important
10 |
11 | Clear First Responder Assignment prevents confusion during critical incidents and ensures someone takes immediate ownership of emerging problems. It reduces response time by eliminating the "someone else will handle it" mentality and creates accountability within incident management workflows.
12 |
13 | ## Example Of First Responder Assignment
14 |
15 | When a critical database alert triggers at 2 AM, the incident management system automatically assigns it to Sarah, the on-call database engineer. She receives the notification, acknowledges it within minutes, and begins troubleshooting while other team members are automatically notified based on the incident's severity.
--------------------------------------------------------------------------------
/src/first-responder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A first responder in incident management is the person or team who reacts first to an alert or incident.
3 | term: First Responder
4 | ---
5 | ## What Is First Responder
6 |
7 | A first responder in incident management is the person or team who reacts first to an alert or incident. They assess the situation, start troubleshooting, and may escalate the issue if needed.
8 |
9 | ## Why Is First Responder Important
10 |
11 | First responders help reduce downtime by acting quickly when incidents occur. Their fast action can limit the impact on users and business operations.
12 |
13 | ## Example Of First Responder
14 |
15 | A site reliability engineer receives a high-priority alert at 2 AM. She investigates the issue, restarts a failed service, and updates the incident log before passing it to the next team.
--------------------------------------------------------------------------------
/src/fix.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A fix is a solution or correction implemented to resolve an incident or problem.
3 | term: Fix
4 | ---
5 | ## What Is Fix
6 |
7 | A fix is a solution or correction implemented to resolve an incident or problem. In incident management, fixes can range from simple restarts to complex code changes or infrastructure modifications that address the root cause of an issue.
8 |
9 | ## Why Is Fix Important
10 |
11 | Fixes restore normal service operations and prevent incident recurrence. They transform reactive incident response into proactive problem management. Properly documented fixes also build organizational knowledge and improve future incident handling.
12 |
13 | ## Example Of Fix
14 |
15 | After investigating a website outage, engineers discover a memory leak in a new code release. They implement a fix by patching the code to properly release memory resources and deploy it through an emergency change process.
--------------------------------------------------------------------------------
/src/fixed-asset.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: In incident management, a fixed asset refers to long-term physical infrastructure components like servers, network equipment, or data centers that support IT operations.
3 | term: Fixed Asset
4 | ---
5 | ## What Is Fixed Asset
6 |
7 | In incident management, a fixed asset refers to long-term physical infrastructure components like servers, network equipment, or data centers that support IT operations. These assets have significant value and extended useful lives.
8 |
9 | ## Why Is Fixed Asset Important
10 |
11 | Fixed assets form the foundation of IT infrastructure that supports critical services. Understanding these assets helps teams identify potential points of failure, plan maintenance activities, and make informed decisions during incidents involving physical infrastructure.
--------------------------------------------------------------------------------
/src/glossary.hbs:
--------------------------------------------------------------------------------
1 | ---
2 | layout: layouts/base.hbs
3 | title: Home
4 | ---
5 |
6 |
7 |
Spike.sh Glossary
8 |
Welcome to our comprehensive glossary of terms related to incident response, monitoring, and digital services.
9 |
10 |
11 |
12 |
Featured Terms
13 |
14 | {{#each collections.featuredItems}}
15 |
16 |
{{data.title}}
17 |
{{data.excerpt}}
18 |
19 |
20 | {{/each}}
21 |
22 |
23 |
24 |
27 |
--------------------------------------------------------------------------------
/src/ground-support-unit.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Ground Support Unit is a specialized team that provides logistical and operational support during major incidents.
3 | term: Ground Support Unit
4 | ---
5 | ## What Is Ground Support Unit
6 |
7 | A Ground Support Unit is a specialized team that provides logistical and operational support during major incidents. They handle resource coordination, equipment deployment, and physical infrastructure needs to support incident responders.
8 |
9 | ## Why Is Ground Support Unit Important
10 |
11 | Ground Support Units free up technical responders to focus on incident resolution rather than logistics. They provide crucial infrastructure stability during crisis situations and help maintain operational continuity when normal systems are compromised.
12 |
13 | ## Example Of Ground Support Unit
14 |
15 | During a data center power failure, the Ground Support Unit coordinates generator deployment, manages fuel supplies, arranges emergency cooling systems, and ensures responders have workspaces with power and connectivity to manage the incident.
16 |
17 | ## How To Build Ground Support Unit
18 |
19 | - Identify team members with logistics and operational expertise
20 | - Define clear roles and responsibilities for the unit
21 | - Create resource inventories and supplier contacts
22 | - Develop procedures for rapid deployment
23 | - Train the unit through regular exercises and simulations
--------------------------------------------------------------------------------
/src/hazard-mitigation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Hazard mitigation in incident management is the process of identifying potential risks and taking proactive steps to reduce their impact or likelihood.
3 | term: Hazard Mitigation
4 | ---
5 | ## What Is Hazard Mitigation
6 |
7 | Hazard mitigation in incident management is the process of identifying potential risks and taking proactive steps to reduce their impact or likelihood. It involves analyzing past incidents, assessing current vulnerabilities, and implementing preventive measures to minimize future threats.
8 |
9 | ## Why Is Hazard Mitigation Important
10 |
11 | Hazard mitigation helps organizations reduce the frequency and severity of incidents. By addressing potential problems before they occur, companies can save time, resources, and reputation. It also improves overall system reliability and resilience.
12 |
13 | ## Example of Hazard Mitigation
14 |
15 | A tech company identifies that power outages often lead to service disruptions. They implement backup power systems and distribute their servers across multiple locations to mitigate this hazard.
16 |
17 | ## How to Mitigate Hazards
18 |
19 | - Conduct a thorough risk assessment
20 | - Prioritize identified hazards based on impact and likelihood
21 | - Develop mitigation strategies for high-priority hazards
22 | - Implement chosen mitigation measures
23 | - Monitor and evaluate the effectiveness of mitigation efforts
--------------------------------------------------------------------------------
/src/health-check.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A health check in incident management is a routine assessment of a system's operational status.
3 | term: Health Check
4 | ---
5 | ## What Is A Health Check
6 |
7 | A health check in incident management is a routine assessment of a system's operational status. It involves monitoring key performance indicators, checking for early warning signs of potential issues, and verifying that all components are functioning correctly.
8 |
9 | ## Why Is A Health Check Important
10 |
11 | Regular health checks help detect problems early, preventing minor issues from escalating into major incidents. They provide a snapshot of system health, allowing teams to maintain optimal performance and reduce downtime.
12 |
13 | ## How To Do Health Checks
14 |
15 | - Identify critical components and services to monitor
16 | - Set up automated monitoring tools for continuous checks
17 | - Define thresholds for normal vs. abnormal behavior
18 | - Establish a process for addressing issues detected during health checks
19 | - Regularly review and adjust health check parameters
20 |
21 | ## Best Practices
22 |
23 | - Automate health checks where possible
24 | - Include both technical and business metrics in health checks
25 | - Act promptly on health check results to prevent incidents
--------------------------------------------------------------------------------
/src/high-availability.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: High Availability is a system design approach that ensures an agreed level of operational performance, usually uptime, for a higher-than-normal period.
3 | term: High Availability
4 | ---
5 | ## What Is High Availability
6 |
7 | High Availability is a system design approach that ensures an agreed level of operational performance, usually uptime, for a higher-than-normal period. In incident management, high availability systems are designed to operate continuously without failure for a designated period of time through redundancy, failover capabilities, and elimination of single points of failure.
8 |
9 | ## Why Is High Availability Important
10 |
11 | High availability directly impacts business continuity and customer satisfaction. When critical systems remain operational despite component failures, organizations avoid costly downtime, maintain service level agreements, and protect their reputation. For incident teams, high availability reduces the frequency and severity of incidents they must manage.
12 |
13 | ## Example Of High Availability
14 |
15 | A cloud service provider implements high availability by deploying applications across multiple data centers. When a hardware failure occurs in one location, traffic automatically routes to healthy servers in another data center. Users experience no service interruption while the incident team addresses the underlying issue.
--------------------------------------------------------------------------------
/src/high-priority-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A High Priority Incident is an event that severely impacts business operations, affects numerous users, or threatens data security.
3 | term: High Priority Incident
4 | ---
5 | ## What Is High Priority Incident
6 |
7 | A High Priority Incident is an event that severely impacts business operations, affects numerous users, or threatens data security. These incidents require immediate attention, dedicated resources, and often involve escalation to senior technical staff or management.
8 |
9 | ## Why Is Identifying High Priority Incident Important
10 |
11 | Properly identifying and handling high priority incidents ensures that critical issues receive appropriate resources and attention. This prioritization helps organizations minimize damage, reduce downtime, and maintain customer trust during serious disruptions.
12 |
13 | ## Example Of High Priority Incident
14 |
15 | An e-commerce platform experiences a database failure during a major sales event, preventing customers from completing purchases. The incident team immediately classifies this as high priority, activates the incident command structure, and mobilizes database specialists to restore functionality.
--------------------------------------------------------------------------------
/src/human-error.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Human error in incident management refers to mistakes, oversights, or poor decisions made by individuals that lead to or exacerbate incidents.
3 | term: Human Error
4 | ---
5 | ## What Is Human Error
6 |
7 | Human error in incident management refers to mistakes, oversights, or poor decisions made by individuals that lead to or exacerbate incidents. These can range from simple typos to complex misjudgments in critical situations.
8 |
9 | ## Why Is Understanding Human Error Important
10 |
11 | Recognizing the role of human error helps organizations develop more effective incident prevention and response strategies. It shifts focus from blame to systemic improvements, fostering a culture of learning and continuous improvement.
12 |
13 | ## Example of Human Error
14 |
15 | A system administrator accidentally deletes a critical database during routine maintenance, causing a major service outage. This incident highlights the need for better safeguards and verification processes.
--------------------------------------------------------------------------------
/src/immediate-resolution.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Immediate Resolution is the rapid fixing of an incident without escalation to other teams or extensive investigation.
3 | term: Immediate Resolution
4 | ---
5 | ## What Is Immediate Resolution
6 |
7 | Immediate Resolution is the rapid fixing of an incident without escalation to other teams or extensive investigation. It typically applies to simple, well-understood issues with established solutions that can be implemented quickly by first responders.
8 |
9 | ## Why Is Immediate Resolution Important
10 |
11 | Immediate Resolution minimizes service disruption and reduces the overall cost of incidents. It prevents minor issues from escalating into major problems and frees up specialized resources to focus on more complex incidents. Quick wins also boost team morale and customer satisfaction.
12 |
13 | ## Example Of Immediate Resolution
14 |
15 | A monitoring alert shows a web server approaching memory limits. The on-call engineer recognizes this as a common issue and immediately restarts the application server process, resolving the problem within minutes before users experience any slowdown.
--------------------------------------------------------------------------------
/src/impact-analysis-tools-for-incidents.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Impact Analysis Tools for Incidents are software solutions that help organizations assess and visualize the potential consequences of IT incidents on business operations, services, and users.
3 | term: Impact Analysis Tools For Incidents
4 | ---
5 | ## What Are Impact Analysis Tools For Incidents
6 |
7 | Impact Analysis Tools for Incidents are software solutions that help organizations assess and visualize the potential consequences of IT incidents on business operations, services, and users. These tools map dependencies and simulate incident scenarios to provide a comprehensive view of possible impacts.
8 |
9 | ## Why Are Impact Analysis Tools For Incidents Important
10 |
11 | These tools enable IT teams to quickly understand the scope and severity of an incident. They help prioritize response efforts, allocate resources effectively, and communicate potential impacts to stakeholders. This leads to faster, more informed decision-making during incident management.
12 |
13 | ## Example of Impact Analysis Tools For Incidents
14 |
15 | A bank's impact analysis tool detects a potential database failure. It immediately shows which customer-facing services would be affected, the number of users impacted, and the estimated financial loss per hour of downtime.
--------------------------------------------------------------------------------
/src/incident-commander.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Incident Commander is the designated leader who manages the response to an incident.
3 | term: Incident Commander
4 | ---
5 | ## What Is Incident Commander
6 |
7 | An Incident Commander is the designated leader who manages the response to an incident. They coordinate all aspects of incident resolution, make critical decisions, delegate tasks, and serve as the central communication point throughout the incident lifecycle.
8 |
9 | ## Why Is Incident Commander Important
10 |
11 | The Incident Commander provides clear leadership during high-stress situations, preventing confusion and ensuring organized response efforts. They maintain situational awareness, remove obstacles for responders, and keep stakeholders informed about incident status and progress.
12 |
13 | ## Example Of Incident Commander
14 |
15 | During a major database outage, the Incident Commander establishes a virtual war room, assigns engineers to investigate specific components, coordinates with the communications team for customer updates, and makes the final call on implementing a database rollback to restore service.
--------------------------------------------------------------------------------
/src/incident-lifecycle.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: The incident lifecycle is the complete sequence of stages an incident goes through from initial detection to final resolution and review.
3 | term: Incident Lifecycle
4 | ---
5 | ## What Is Incident Lifecycle
6 |
7 | The incident lifecycle is the complete sequence of stages an incident goes through from initial detection to final resolution and review. It typically includes identification, logging, categorization, prioritization, response, resolution, and post-incident analysis.
8 |
9 | ## Why Is Understanding Incident Lifecycle Important
10 |
11 | Understanding the incident lifecycle provides structure to incident management processes. It helps teams track progress, assign responsibilities at each stage, and maintain consistency in how incidents are handled. A well-defined lifecycle improves coordination and reduces resolution time.
12 |
13 | ## Example Of Incident Lifecycle
14 |
15 | A payment processing failure is identified (identification), recorded in the incident management system (logging), classified as a critical service disruption (categorization), assigned highest priority (prioritization), addressed by the response team (response), fixed through a database rollback (resolution), and analyzed in a postmortem meeting (review).
--------------------------------------------------------------------------------
/src/incident-logging.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Incident logging is the process of creating a formal record of an incident in an incident management system.
3 | term: Incident Logging
4 | ---
5 | ## What Is Incident Logging
6 |
7 | Incident logging is the process of creating a formal record of an incident in an incident management system. It captures essential details including the time of occurrence, affected systems, symptoms, impact, and initial categorization.
8 |
9 | ## Why Is Incident Logging Important
10 |
11 | Proper incident logging creates a single source of truth for each incident. It provides the foundation for effective incident management, enables accurate tracking and reporting, and preserves critical information for analysis and knowledge sharing after resolution.
12 |
13 | ## Example Of Incident Logging
14 |
15 | After receiving reports of slow response times on the checkout page, an engineer creates an incident ticket that includes the timestamp, affected service, user impact description, initial severity assessment, and links to relevant monitoring dashboards.
16 |
17 | ## How To Log Incidents With Spike
18 |
19 | - Connect your monitoring tools, and Spike automatically logs incidents for you
20 | - You can also log incidents manually through the web UI, Slack, or API
21 | - Spike captures essential details and keeps a full timeline of all incident activity
22 |
23 | Centralize your incident records and gain clarity. Log your next incident with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/incident-management.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Incident management is the process of responding to unplanned events or service disruptions to restore normal operations as quickly as possible.
3 | term: Incident Management
4 | ---
5 | ## What Is Incident Management
6 |
7 | Incident management is the process of responding to unplanned events or service disruptions to restore normal operations as quickly as possible. It encompasses the entire workflow from detection through resolution, including communication, coordination, and continuous improvement.
8 |
9 | ## Why Is Incident Management Important
10 |
11 | Effective incident management minimizes downtime, reduces business impact, and maintains service quality. It provides structure during stressful situations, improves team coordination, and builds organizational resilience through systematic learning from past incidents.
12 |
13 | ## Example Of Incident Management
14 |
15 | When a cloud service provider experiences a network outage, their incident management process activates. A response team assembles, communicates with affected customers, implements workarounds, resolves the underlying issue, and documents lessons learned to prevent similar incidents.
--------------------------------------------------------------------------------
/src/incident-manager.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Incident Manager is a professional responsible for overseeing the entire incident management process.
3 | term: Incident Manager
4 | ---
5 | ## Who Is an Incident Manager
6 |
7 | An Incident Manager is a professional responsible for overseeing the entire incident management process. They coordinate response efforts, communicate with stakeholders, and ensure timely resolution of incidents to minimize impact on business operations.
8 |
9 | ## Why Is an Incident Manager Important
10 |
11 | Incident Managers are crucial for effective incident response. They provide leadership during crises, streamline communication, and make critical decisions to resolve issues quickly. Their role helps organizations maintain service quality and minimize downtime.
--------------------------------------------------------------------------------
/src/incident-model.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Incident Model is a standardized framework for categorizing and responding to different types of incidents.
3 | term: Incident Model
4 | ---
5 | ## What Is an Incident Model
6 |
7 | An Incident Model is a standardized framework for categorizing and responding to different types of incidents. It defines the steps, roles, and procedures for handling specific incident scenarios, ensuring consistent and efficient responses.
8 |
9 | ## Why Is an Incident Model Important
10 |
11 | Incident Models provide a structured approach to incident management. They help teams respond more quickly and effectively, reduce errors, and improve overall incident resolution times.
12 |
13 | ## Example of an Incident Model
14 |
15 | A company's incident model for network outages includes steps for initial assessment, team mobilization, root cause analysis, and service restoration, with defined roles for each stage.
16 |
17 | ## How to Build an Incident Model
18 |
19 | - Identify common incident types in your organization
20 | - Define clear steps for each incident type
21 | - Assign roles and responsibilities
22 | - Document the model and train staff
23 | - Regularly review and update the model
24 |
25 | ## Best Practices
26 |
27 | - Keep the model flexible to adapt to unique situations
28 | - Align the model with industry best practices and standards
29 | - Regularly test the model through simulations or drills
--------------------------------------------------------------------------------
/src/incident-monitoring.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Incident Monitoring is the continuous observation of systems, networks, and applications to detect and track incidents.
3 | term: Incident Monitoring
4 | ---
5 | ## What Is Incident Monitoring
6 |
7 | Incident Monitoring is the continuous observation of systems, networks, and applications to detect and track incidents. It involves using tools and processes to identify anomalies, errors, or performance issues that could indicate an incident.
8 |
9 | ## Why Is Incident Monitoring Important
10 |
11 | Effective incident monitoring allows organizations to detect issues early, reducing response times and minimizing impact. It provides real-time visibility into system health and helps prevent minor issues from escalating into major incidents.
12 |
13 | ## Example of Incident Monitoring
14 |
15 | A monitoring system detects a sudden spike in server CPU usage and automatically alerts the on-call team, allowing them to investigate and address the issue before it affects users.
16 |
17 | ## How to Implement Incident Monitoring
18 |
19 | - Select and deploy appropriate monitoring tools
20 | - Define key metrics and thresholds
21 | - Set up alerting mechanisms
22 | - Establish a process for reviewing and acting on alerts
23 | - Continuously refine monitoring parameters
24 |
25 | ## Best Practices
26 |
27 | - Use a combination of automated and manual monitoring techniques
28 | - Implement centralized logging for easier analysis
29 | - Regularly review and update monitoring thresholds and rules
--------------------------------------------------------------------------------
/src/incident-record.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An incident record is a documented entry that captures all the details of an incident from detection to resolution.
3 | term: Incident Record
4 | ---
5 | ## What Is Incident Record
6 |
7 | An incident record is a documented entry that captures all the details of an incident from detection to resolution. It includes information such as the incident's description, time of occurrence, affected systems, severity level, assigned responders, and resolution steps taken.
8 |
9 | ## Why Is Incident Record Important
10 |
11 | Incident records provide a complete audit trail of all incidents within an organization. They help teams track response progress, analyze patterns over time, and create a knowledge base for similar future incidents. Well-maintained incident records also support compliance requirements and facilitate post-incident reviews.
12 |
13 | ## Example Of Incident Record
14 |
15 | A database server crashes at 2:15 PM. The incident record captures the timestamp, affected service (customer database), severity (high), assigned responder (Jane Smith), actions taken (server restart and data verification), and resolution time (2:45 PM).
--------------------------------------------------------------------------------
/src/incident-resolution.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Incident resolution is the process of restoring normal service operation after an incident has occurred.
3 | term: Incident Resolution
4 | ---
5 | ## What Is Incident Resolution
6 |
7 | Incident resolution is the process of restoring normal service operation after an incident has occurred. It involves identifying the root cause, implementing a solution, verifying that service has been restored, and documenting the resolution steps taken.
8 |
9 | ## Why Is Incident Resolution Important
10 |
11 | Effective incident resolution minimizes downtime and reduces the impact on users and business operations. It helps maintain service reliability and customer trust. The speed and quality of incident resolution directly affect an organization's operational efficiency and reputation.
12 |
13 | ## Example Of Incident Resolution
14 |
15 | A payment processing system fails. The incident resolution involves identifying a memory leak in the application, deploying a hotfix to address the issue, verifying that transactions are processing correctly, and documenting the fix for future reference.
16 |
17 | ## How To Resolve Incidents With Spike
18 |
19 | - Open the incident in Spike and click the Resolve button
20 | - You can resolve incidents from web, mobile, Slack, email, or by sending an SMS
21 | - Add a resolution note to document what fixed the issue
22 | - Spike updates the status and notifies everyone involved
23 |
24 | Start resolving incidents faster and keep your team in sync with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/incident-status-information.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Incident Status Information is real-time data about the current state of an incident, including its severity, affected systems, resolution progress, and estimated time to resolution.
3 | term: Incident Status Information
4 | ---
5 | ## What Is Incident Status Information
6 |
7 | Incident Status Information is real-time data about the current state of an incident, including its severity, affected systems, resolution progress, and estimated time to resolution. This information helps teams track incident lifecycle and keeps stakeholders informed about ongoing issues and their impact.
8 |
9 | ## Why Is Incident Status Information Important
10 |
11 | Incident Status Information creates transparency across the organization during critical events. It reduces duplicate inquiries, allows teams to coordinate effectively, and helps stakeholders make informed decisions about business continuity. Clear status updates also build trust with customers and prevent speculation.
12 |
13 | ## Example Of Incident Status Information
14 |
15 | During a payment processing outage, the status information might include: "High severity incident affecting checkout systems. Engineering team identified database connection issue at 14:30. Estimated resolution time: 2 hours. Workaround available for urgent transactions through backup payment gateway."
--------------------------------------------------------------------------------
/src/incident-summary.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An incident summary is a brief overview of an incident, including what happened, when it occurred, and its impact.
3 | term: Incident Summary
4 | ---
5 | ## What Is Incident Summary
6 |
7 | An incident summary is a brief overview of an incident, including what happened, when it occurred, and its impact. It helps teams understand the situation quickly and decide on next steps.
8 |
9 | ## Why Is Incident Summary Important
10 |
11 | A clear incident summary saves time during investigations and debriefs. It gives everyone a shared understanding of the incident, which improves response and learning.
12 |
13 | ## Example of Incident Summary
14 |
15 | A summary might state: "On May 10th, the database server crashed at 2:00 AM, affecting user logins for 30 minutes. The root cause was a failed update."
16 |
17 | ## Best Practices
18 |
19 | - Keep summaries concise and factual
20 | - Include key details: time, impact, and cause
21 | - Write summaries right after the incident for accuracy
--------------------------------------------------------------------------------
/src/incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An incident is an unplanned interruption, degradation, or failure of a service, system, or infrastructure component that impacts business operations or users.
3 | term: Incident
4 | ---
5 | ## What Is Incident
6 |
7 | An incident is an unplanned interruption, degradation, or failure of a service, system, or infrastructure component that impacts business operations or users. In incident management, it represents any event that disrupts normal service delivery and requires intervention to resolve.
8 |
9 | ## Example Of Incident
10 |
11 | A payment processing system experiences a 30% increase in transaction failures during peak shopping hours. This is classified as a high-priority incident because it directly impacts revenue and customer experience. The incident triggers automated alerts and brings together a response team to investigate and resolve the issue.
--------------------------------------------------------------------------------
/src/initial-response.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Initial response is the first set of actions taken after an incident is detected.
3 | term: Initial Response
4 | ---
5 | ## What Is Initial Response
6 |
7 | Initial response is the first set of actions taken after an incident is detected. It includes assessing the situation, containing the impact, and starting communication with key stakeholders.
8 |
9 | ## Why Is Initial Response Important
10 |
11 | A strong initial response can limit damage, speed up recovery, and keep everyone aligned. It sets the foundation for effective incident management.
12 |
13 | ## Example Of Initial Response
14 |
15 | After a security breach alert, the IT team isolates affected systems, notifies leadership, and gathers information for further investigation.
16 |
17 | ## How To Implement Initial Response
18 |
19 | - Create clear response playbooks for common incidents
20 | - Train teams on immediate containment steps
21 | - Use automated alerts for faster action
22 | - Assign communication roles early
23 | - Review and update response steps after each incident
--------------------------------------------------------------------------------
/src/integration-ecosystem.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An integration ecosystem in incident management is a network of interconnected tools, platforms, and systems that work together to detect, respond to, and resolve incidents.
3 | term: Integration Ecosystem
4 | ---
5 | ## What Is Integration Ecosystem
6 |
7 | An integration ecosystem in incident management is a network of interconnected tools, platforms, and systems that work together to detect, respond to, and resolve incidents. It creates a unified environment where monitoring tools, communication platforms, ticketing systems, and automation tools share data and functionality seamlessly.
8 |
9 | ## Why Is Integration Ecosystem Important
10 |
11 | An integration ecosystem eliminates silos between different incident management tools, reducing manual work and preventing information gaps. It speeds up incident response by automating data flow between systems and provides teams with complete context when troubleshooting problems.
12 |
13 | ## Example Of Integration Ecosystem
14 |
15 | A company connects their monitoring platform (Datadog) with their incident management tool (Spike), which then integrates with Slack for team communication and PagerDuty for on-call notifications. When an issue occurs, alerts flow automatically through this ecosystem, creating tickets, notifying the right people, and centralizing all relevant information.
--------------------------------------------------------------------------------
/src/joint-command.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Joint Command lets leaders from different agencies share decision-making while managing an incident together.
3 | term: Joint Command
4 | ---
5 | ## What Is Joint Command
6 |
7 | Joint Command is a structure used in incident management where leaders from different agencies or jurisdictions share the responsibility for managing an incident. It's often referred to as Unified Command (UC) within the Incident Command System (ICS). These leaders work together to make decisions.
8 |
9 | ## Importance Of Joint Command
10 |
11 | Joint Command is essential for incidents involving multiple agencies, like large fires or multi-jurisdictional events. It ensures a coordinated response, prevents conflicting orders, and allows agencies to pool resources effectively under shared objectives.
--------------------------------------------------------------------------------
/src/joint-information-center-jic.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A JIC is where agencies work together to share accurate and timely info during incidents.
3 | term: Joint Information Center (JIC)
4 | ---
5 | ## What Is Joint Information Center (JIC)
6 |
7 | A Joint Information Center, or JIC, is a central location where communication staff from different organizations involved in managing an incident work together. It helps coordinate and release timely, accurate, and consistent information to the public and stakeholders. A JIC can be a physical place or operate virtually.
8 |
9 | ## Importance Of Joint Information Center (JIC)
10 |
11 | JICs prevent confusion by ensuring everyone receives the same message from a single, trusted source. This builds public confidence and helps manage rumors during a crisis. It allows response agencies to coordinate their public messaging efficiently.
--------------------------------------------------------------------------------
/src/judgement-call.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A judgment call is a decision made using experience and intuition when rules don't clearly apply.
3 | term: Judgment Call
4 | ---
5 | ## What Is Judgment Call
6 |
7 | A judgment call is a decision made using experience, intuition, and critical thinking in situations not clearly defined by rules. It often happens in complex or unexpected circumstances where standard procedures don't fully apply.
8 |
9 | ## Why Is Judgment Call Important
10 |
11 | Judgment calls are vital for handling unique or edge cases effectively. They empower responders to assess risk and make timely decisions. This is crucial when procedures are ambiguous or incomplete.
12 |
13 | ## Example Of Judgment Call
14 |
15 | A security team finds unusual network traffic. Automated tools don't flag it as malicious. However, an experienced analyst uses their judgment. They recognize subtle patterns indicating a potential threat and escalate the issue for investigation.
--------------------------------------------------------------------------------
/src/known-error.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Known Error is a documented IT issue with a root cause and workaround but no permanent fix yet.
3 | term: Known Error
4 | ---
5 | ## What Is Known Error
6 |
7 | A Known Error is a problem within an IT system that has been identified and analyzed. Often, its root cause and a temporary workaround are known, but a permanent solution is not yet available. It represents a documented issue awaiting a final fix.
8 |
9 | ## Importance Of Known Error
10 |
11 | Recognizing Known Errors helps IT teams resolve recurring incidents faster using documented workarounds. It links ongoing disruptions back to their underlying cause, aiding problem management efforts. This avoids repeated diagnosis for the same issue.
12 |
13 | ## Example Of Known Error
14 |
15 | A software application crashes due to a specific bug when processing large files. The team knows the bug exists and has a workaround (process smaller files separately). Until the bug is fixed in a future release, this is a Known Error.
--------------------------------------------------------------------------------
/src/latency.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Latency is the time delay between an action and the resulting response in a system.
3 | term: Latency
4 | ---
5 | ## What Is Latency
6 |
7 | Latency is the time delay between an action and the resulting response in a system. In incident management, latency often refers to response time delays in applications, networks, or services that can degrade user experience or indicate underlying problems.
8 |
9 | ## Example Of Latency
10 |
11 | A database query that normally takes 50 milliseconds begins taking 2 seconds to complete. This latency increase causes the entire application to slow down. Monitoring detects this change and alerts the team, who discover that a missing index is causing the slowdown.
12 |
13 | ## How To Implement Latency Monitoring
14 |
15 | - Define acceptable latency thresholds for critical services
16 | - Implement monitoring at multiple points in your system
17 | - Set up alerts for when latency exceeds normal ranges
18 | - Use synthetic transactions to test latency proactively
19 | - Track latency trends over time to identify gradual degradations
20 |
21 | ## Best Practices
22 |
23 | - Monitor latency from the end-user perspective, not just internal metrics
24 | - Establish baseline performance metrics during normal operations
25 | - Create latency heat maps to visualize problem areas in complex systems
--------------------------------------------------------------------------------
/src/major-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Major Incident is a high-impact, high-urgency event that causes significant disruption to business operations or services.
3 | term: Major Incident
4 | ---
5 | ## What Is Major Incident
6 |
7 | A Major Incident is a high-impact, high-urgency event that causes significant disruption to business operations or services. It typically affects multiple users or critical systems, requires immediate response, and often involves cross-team coordination to resolve. Major incidents demand heightened attention and specialized handling procedures beyond routine issues.
8 |
9 | ## Example Of Major Incident
10 |
11 | A payment processor experiences a database failure during Black Friday, preventing customers from completing purchases across hundreds of e-commerce sites. The company immediately declares a major incident, assembles a response team, and works to restore service while communicating updates to affected merchants.
--------------------------------------------------------------------------------
/src/manual-escalation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Manual escalation is when an on-call responder decides to pass an incident to another team member or a higher-level expert.
3 | term: Manual Escalation
4 | ---
5 | ## What Is Manual Escalation
6 |
7 | Manual escalation is when an on-call responder decides to pass an incident to another team member or a higher-level expert. This is done when the issue is beyond their expertise or authority.
8 |
9 | ## Why Is Manual Escalation Important
10 |
11 | Manual escalation helps incidents get resolved by the right people. It avoids delays that happen when someone tries to fix an issue they can't handle.
12 |
13 | ## How to Implement Manual Escalation
14 |
15 | - Define clear escalation paths and contacts
16 | - Train responders on when and how to escalate
17 | - Document escalation steps in incident response plans
18 | - Use tools that make it easy to hand off incidents
--------------------------------------------------------------------------------
/src/mean-time-to-detect-mttd.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Mean Time to Detect (MTTD) is the average time between when an incident actually begins and when it is detected by monitoring systems or users.
3 | term: Mean Time To Detect (MTTD)
4 | ---
5 | ## What Is Mean Time To Detect (MTTD)
6 |
7 | Mean Time to Detect (MTTD) is the average time between when an incident actually begins and when it is detected by monitoring systems or users. This metric measures how quickly your organization identifies problems after they occur.
8 |
9 | ## Why Is MTTD Important
10 |
11 | MTTD directly affects incident impact. Faster detection leads to faster resolution and less damage. This metric helps organizations evaluate the effectiveness of their monitoring tools and observability practices.
12 |
13 | ## Example Of MTTD
14 |
15 | A database begins experiencing performance degradation at 9:15 AM. At 9:23 AM, monitoring alerts trigger based on slow query response times. The MTTD is 8 minutes.
16 |
17 | ## How To Track MTTD
18 |
19 | - Deploy comprehensive monitoring across all critical systems
20 | - Configure alerts with appropriate thresholds for early detection
21 | - Use anomaly detection to identify unusual patterns
22 | - Track incident start times through system logs and user reports
23 | - Calculate and review MTTD regularly across incident categories
24 |
25 | ## Best Practices
26 |
27 | - Implement real-time monitoring for critical services
28 | - Use synthetic monitoring to detect issues before users do
29 | - Create redundant detection methods for critical systems
--------------------------------------------------------------------------------
/src/mean-time-to-recovery-mttr.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Mean Time to Recovery (MTTR) is the average time between when a system fails and when it returns to full functionality.
3 | term: Mean Time To Recovery (MTTR)
4 | ---
5 | ## What Is Mean Time To Recovery (MTTR)
6 |
7 | Mean Time to Recovery (MTTR) is the average time between when a system fails and when it returns to full functionality. This metric focuses specifically on the restoration period, measuring how quickly services can be brought back online after an outage.
8 |
9 | ## Why Is MTTR Important
10 |
11 | MTTR directly impacts business continuity and user experience. Faster recovery means less downtime and fewer frustrated users. This metric helps organizations evaluate their disaster recovery capabilities and resilience strategies.
12 |
13 | ## Example Of MTTR
14 |
15 | A database server crashes at 3:00 PM. After emergency response procedures, the database is back online at 3:45 PM. The MTTR is 45 minutes.
16 |
17 | ## How To Track MTTR
18 |
19 | - Record precise failure and recovery timestamps for all incidents
20 | - Calculate average recovery times across different systems
21 | - Compare actual MTTR against recovery time objectives
22 | - Identify systems with consistently high recovery times
23 | - Test recovery procedures regularly to improve MTTR
24 |
25 | ## Best Practices
26 |
27 | - Implement automated recovery procedures where possible
28 | - Maintain up-to-date recovery playbooks for all critical systems
29 | - Practice recovery scenarios through regular drills
--------------------------------------------------------------------------------
/src/mean-time-to-resolve-mttr.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Mean Time to Resolve (MTTR) is the average time between when an incident is detected and when it is fully resolved.
3 | term: Mean Time To Resolve (MTTR)
4 | ---
5 | ## What Is Mean Time To Resolve (MTTR)
6 |
7 | Mean Time to Resolve (MTTR) is the average time between when an incident is detected and when it is fully resolved. This metric measures the total lifecycle of an incident from discovery to complete resolution, including diagnosis and repair time.
8 |
9 | ## Why Is MTTR Important
10 |
11 | MTTR directly reflects the efficiency of your incident response process. Lower MTTR means less downtime and business impact. This comprehensive metric helps organizations evaluate their overall incident management effectiveness.
12 |
13 | ## Example Of MTTR
14 |
15 | A monitoring alert detects a critical application crash at 10:00 AM. After diagnosis and fixes, service is fully restored at 11:30 AM. The MTTR is 90 minutes.
16 |
17 | ## How To Implement MTTR Tracking
18 |
19 | - Record precise start and end times for all incidents
20 | - Calculate MTTR across different incident categories and severities
21 | - Compare MTTR trends over time to measure improvement
22 | - Break down MTTR into component phases to identify bottlenecks
23 | - Set realistic MTTR targets based on service level objectives
24 |
25 | ## Best Practices
26 |
27 | - Develop and refine incident response playbooks
28 | - Conduct regular training for response teams
29 | - Implement automated remediation for common issues
--------------------------------------------------------------------------------
/src/monitoring-system.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A monitoring system is a set of tools and processes that track the health, performance, and availability of IT infrastructure and applications.
3 | term: Monitoring System
4 | ---
5 | ## What Is a Monitoring System
6 |
7 | A monitoring system is a set of tools and processes that track the health, performance, and availability of IT infrastructure and applications. It detects issues in real time and sends alerts to incident response teams.
8 |
9 | ## Why Is a Monitoring System Important
10 |
11 | Monitoring systems catch problems before they impact users. They help teams respond faster, reduce downtime, and maintain service reliability.
12 |
13 | ## Example Of a Monitoring System
14 |
15 | A SaaS company uses a monitoring system to track server CPU usage. When usage spikes, the system sends an alert to the on-call engineer.
16 |
17 | ## How To Implement a Monitoring System
18 |
19 | - Identify critical systems and services to monitor
20 | - Choose monitoring tools that fit your needs
21 | - Set up alert thresholds for key metrics
22 | - Integrate the monitoring system with your incident management platform
23 | - Test alerts and adjust thresholds as needed
24 |
25 | ## Best Practices
26 |
27 | - Regularly review and update monitoring coverage
28 | - Set actionable alert thresholds to avoid alert fatigue
--------------------------------------------------------------------------------
/src/mutual-aid-agreement.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Mutual Aid Agreement is a formal arrangement between organizations to provide assistance to each other during incidents or emergencies that exceed their individual response capabilities.
3 | term: Mutual Aid Agreement
4 | ---
5 | ## What Is a Mutual Aid Agreement
6 |
7 | A Mutual Aid Agreement is a formal arrangement between organizations to provide assistance to each other during incidents or emergencies that exceed their individual response capabilities. These agreements define the terms, conditions, and procedures for requesting and providing support.
8 |
9 | ## Why Is Mutual Aid Important
10 |
11 | Mutual Aid Agreements strengthen incident response capabilities by allowing organizations to share resources, expertise, and personnel during major incidents. They create a support network that helps maintain service continuity, reduce recovery time, and mitigate the impact of large-scale incidents.
12 |
13 | ## Example Of Mutual Aid
14 |
15 | Two regional hospitals establish a mutual aid agreement for IT disaster recovery. When Hospital A experiences a ransomware attack that compromises their electronic health record system, Hospital B provides temporary server capacity and technical staff to help restore critical services.
--------------------------------------------------------------------------------
/src/national-incident-management-system-nims.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: The National Incident Management System (NIMS) is a standardized approach to incident management developed by the U.S. Department of Homeland Security.
3 | term: National Incident Management System (NIMS)
4 | ---
5 | ## What Is the National Incident Management System (NIMS)
6 |
7 | The National Incident Management System (NIMS) is a standardized approach to incident management developed by the U.S. Department of Homeland Security. It provides a consistent framework for government, private sector, and non-governmental organizations to work together during disasters and emergencies, regardless of size, location, or complexity.
8 |
9 | ## Why Is the National Incident Management System (NIMS) Important
10 |
11 | NIMS establishes common terminology, organizational structures, and operational procedures that enable effective coordination during incidents. For organizations managing critical infrastructure or public services, NIMS alignment improves collaboration with emergency responders and government agencies during major incidents that require multi-agency response.
--------------------------------------------------------------------------------
/src/network-latency.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Network latency is the time delay between sending and receiving data across a network.
3 | term: Network Latency
4 | ---
5 | ## What Is Network Latency
6 |
7 | Network latency is the time delay between sending and receiving data across a network. In incident management, high latency can impact system performance and user experience, potentially triggering incidents.
8 |
9 | ## Example of Network Latency
10 |
11 | A company's video conferencing system experiences delays and poor quality due to high network latency, disrupting important client meetings.
12 |
13 | ## How to Implement Network Latency Monitoring
14 |
15 | - Deploy network performance monitoring tools
16 | - Establish baseline latency metrics for normal operations
17 | - Set up alerts for abnormal latency spikes
18 | - Regularly test network paths for latency issues
19 |
20 | ## Best Practices
21 |
22 | - Optimize network infrastructure to minimize latency
23 | - Prioritize traffic for latency-sensitive applications
24 | - Educate users on factors that can contribute to network latency
--------------------------------------------------------------------------------
/src/network-operations-center-noc.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Network Operations Center (NOC) is a centralized location where IT professionals monitor, manage, and troubleshoot an organization's network infrastructure, systems, and services.
3 | term: Network Operations Center (NOC)
4 | ---
5 | ## What Is a Network Operations Center (NOC)
6 |
7 | A Network Operations Center (NOC) is a centralized location where IT professionals monitor, manage, and troubleshoot an organization's network infrastructure, systems, and services. The NOC serves as the first line of defense against network disruptions, outages, and security incidents, providing 24/7 oversight of critical IT operations.
8 |
9 | ## Why Is a Network Operations Center (NOC) Important
10 |
11 | A NOC is vital for maintaining business continuity by quickly identifying and resolving network issues before they impact users. It centralizes monitoring capabilities, improves incident response times, and provides valuable data for infrastructure planning. For incident management, a NOC acts as the nerve center that detects problems early and coordinates response efforts.
12 |
13 | ## Example Of a Network Operations Center (NOC)
14 |
15 | A global financial services company operates a NOC that monitors thousands of network devices across multiple data centers. When a router failure is detected at 2 AM, NOC analysts immediately receive alerts, diagnose the issue, and dispatch field technicians to replace the hardware before the morning trading session begins.
--------------------------------------------------------------------------------
/src/network-outage.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A network outage is a disruption in network connectivity that prevents users from accessing network resources or services.
3 | term: Network Outage
4 | ---
5 | ## What Is a Network Outage
6 |
7 | A network outage is a disruption in network connectivity that prevents users from accessing network resources or services. It can range from a localized issue affecting a few users to a widespread problem impacting an entire organization.
8 |
9 | ## Example of a Network Outage
10 |
11 | A faulty router causes a complete loss of internet connectivity for an entire office building, preventing employees from accessing cloud-based services and external communications.
12 |
13 | ## How to Implement Network Outage Management
14 |
15 | - Develop a network outage response plan
16 | - Implement network monitoring tools for quick detection
17 | - Establish redundant network paths to minimize single points of failure
18 | - Create a communication plan for notifying affected users
19 |
20 | ## Best Practices
21 |
22 | - Conduct regular network health checks and preventive maintenance
23 | - Maintain an up-to-date network diagram for faster troubleshooting
24 | - Perform post-outage analysis to prevent similar incidents in the future
--------------------------------------------------------------------------------
/src/noise-reduction.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Noise reduction in incident management is the practice of filtering out unnecessary alerts and notifications to focus on meaningful signals.
3 | term: Noise Reduction
4 | ---
5 | ## What Is Noise Reduction
6 |
7 | Noise reduction in incident management is the practice of filtering out unnecessary alerts and notifications to focus on meaningful signals. It involves eliminating duplicate alerts, correlating related issues, and suppressing known false positives.
8 |
9 | ## Why Is Noise Reduction Important
10 |
11 | Noise reduction prevents alert fatigue among incident responders. It helps teams focus on genuine issues that require attention, improves response times for critical incidents, and reduces the risk of important alerts being overlooked amid the noise.
12 |
13 | ## Example Of Noise Reduction
14 |
15 | Instead of generating 50 separate alerts when a server cluster experiences issues, a noise reduction system correlates these into a single incident with detailed context. This prevents the on-call engineer from being overwhelmed with notifications.
16 |
17 | ## How To Reduce Noise With Spike
18 |
19 | - Spike groups related alerts into a single incident, so you see less clutter.
20 | - It suppresses duplicate notifications, keeping only the first alert visible.
21 | - You can set filters to mute known false positives in just a few clicks.
22 |
23 | Cut the noise and help your team focus on real issues—start with [Spike](https://app.spike.sh/signup) today.
--------------------------------------------------------------------------------
/src/non-compliance.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Non-compliance in incident management refers to the failure to adhere to established policies, procedures, or regulatory requirements when handling incidents.
3 | term: Non-Compliance
4 | ---
5 | ## What Is Non-Compliance
6 |
7 | Non-compliance in incident management refers to the failure to adhere to established policies, procedures, or regulatory requirements when handling incidents. It can lead to increased risks, penalties, and ineffective incident resolution.
8 |
9 | ## Example of Non-Compliance
10 |
11 | A company fails to report a data breach within the required 72-hour window, violating GDPR regulations and risking hefty fines.
12 |
13 | ## How to Implement Non-Compliance Prevention
14 |
15 | - Regularly review and update incident management policies
16 | - Provide ongoing training for staff on compliance requirements
17 | - Implement automated compliance checks in incident management tools
18 | - Conduct periodic audits of incident handling processes
--------------------------------------------------------------------------------
/src/non-conformance.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Non-conformance in incident management refers to a deviation from established policies, procedures, or standards during incident handling.
3 | term: Non-Conformance
4 | ---
5 | ## What Is Non-Conformance
6 |
7 | Non-conformance in incident management refers to a deviation from established policies, procedures, or standards during incident handling. It occurs when teams fail to follow documented processes, miss required steps, or bypass controls designed to maintain service quality and security during incident response.
8 |
9 | ## Why Is Identifying Non-Conformance Important
10 |
11 | Identifying and addressing non-conformance helps organizations maintain consistent incident management practices. Unchecked non-conformance can lead to longer resolution times, repeated incidents, compliance violations, and security risks. Tracking these deviations provides valuable insights for process improvement and training needs.
12 |
13 | ## Example Of Non-Conformance
14 |
15 | During a critical system outage, an IT engineer implements an unauthorized configuration change to restore service quickly, bypassing the required change approval process. While the fix works, the lack of documentation and testing creates risk and violates the organization's incident management procedures.
--------------------------------------------------------------------------------
/src/non-critical-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A non-critical incident is an event that disrupts normal business operations but doesn't significantly impact core services or large numbers of users.
3 | term: Non-Critical Incident
4 | ---
5 |
6 | ## What Is a Non-Critical Incident
7 |
8 | A non-critical incident is an event that disrupts normal business operations but doesn't significantly impact core services or large numbers of users. These incidents have minimal urgency and typically allow for standard resolution timeframes without requiring immediate attention or emergency response procedures.
9 |
10 | ## Example Of a Non-Critical Incident
11 |
12 | A printer malfunction affecting a single department would be classified as non-critical. While it impacts productivity for some users, the business can continue operating normally while the issue is resolved through standard support channels.
--------------------------------------------------------------------------------
/src/normal-operations.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Normal operations in incident management refer to the standard functioning of systems and processes without any active incidents or disruptions.
3 | term: Normal Operations
4 | ---
5 | ## What Are Normal Operations
6 |
7 | Normal operations in incident management refer to the standard functioning of systems and processes without any active incidents or disruptions. It represents the baseline state against which incidents are measured.
8 |
9 | ## Example of Normal Operations
10 |
11 | A company's network typically operates at 60% capacity during business hours with 99.9% uptime. Any significant deviation from these metrics could signal an incident.
--------------------------------------------------------------------------------
/src/notification.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A notification is an automated or manual alert sent to individuals or teams about an incident, system change, or important event.
3 | term: Notification
4 | ---
5 | ## What Is a Notification
6 |
7 | A notification is an automated or manual alert sent to individuals or teams about an incident, system change, or important event. In incident management, notifications inform relevant stakeholders about detected issues, their status, and required actions through channels like email, SMS, phone calls, or mobile apps.
8 |
9 | ## Why Is Notification Important
10 |
11 | Effective notifications reduce response time by quickly alerting the right people when incidents occur. They provide critical information needed to begin troubleshooting, coordinate response efforts, and keep stakeholders informed. Well-designed notifications help prevent alert fatigue while ensuring critical issues receive proper attention.
12 |
13 | ## Example Of a Notification
14 |
15 | When a database server reaches 95% disk capacity at midnight, an automated notification system alerts the on-call database administrator via SMS and phone call. The message includes the server name, current disk usage, and a link to the incident details, allowing immediate action to prevent service disruption.
--------------------------------------------------------------------------------
/src/observability-integration.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Observability integration is the process of connecting various monitoring tools, logs, metrics, and tracing systems into a unified framework.
3 | term: Observability Integration
4 | ---
5 | ## What Is Observability Integration
6 |
7 | Observability integration is the process of connecting various monitoring tools, logs, metrics, and tracing systems into a unified framework. It creates a comprehensive view of system behavior by correlating data from different sources, making it easier to understand complex system states and troubleshoot issues.
8 |
9 | ## Why Is Observability Integration Important
10 |
11 | Observability integration provides a complete picture of system health by connecting data from multiple sources. It speeds up troubleshooting by eliminating the need to switch between different tools and helps teams understand relationships between components. This holistic view is crucial for managing modern, complex IT environments.
12 |
13 | ## Example Of Observability Integration
14 |
15 | A microservices application experiences intermittent performance issues. Through their integrated observability platform, engineers can trace a slow customer transaction across multiple services, view the corresponding infrastructure metrics, and examine logs from each component—all in a single interface. This integration helps them quickly identify a memory leak in a specific service.
--------------------------------------------------------------------------------
/src/observability.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Observability is the ability to understand a system's internal state based on its external outputs.
3 | term: Observability
4 | ---
5 | ## What Is Observability
6 |
7 | Observability is the ability to understand a system's internal state based on its external outputs. It combines metrics, logs, and traces to provide insights into what's happening inside complex systems, making it easier to detect, investigate, and resolve incidents.
8 |
9 | ## Why Is Observability Important
10 |
11 | Observability reduces mean time to detect (MTTD) and mean time to resolve (MTTR) by providing context-rich data about system behavior. It helps teams understand not just that something is wrong, but why it's wrong, enabling faster and more accurate incident resolution.
12 |
13 | ## Example Of Observability
14 |
15 | When a payment service experiences increased latency, an observability platform correlates this with recent code deployments, higher database query times, and increased error rates. This gives the incident team clear direction for investigation and resolution.
--------------------------------------------------------------------------------
/src/on-call-shift.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An on-call shift is a set period when a team member is responsible for responding to incidents.
3 | term: On-Call Shift
4 | ---
5 | ## What Is an On-Call Shift
6 |
7 | An on-call shift is a set period when a team member is responsible for responding to incidents. Shifts can last hours or days, depending on the schedule.
8 |
9 | ## Why Is an On-Call Shift Important
10 |
11 | On-call shifts make sure someone is always ready to respond to incidents, even outside normal business hours.
12 |
13 | ## How To Implement an On-Call Shift
14 |
15 | - Define shift length and coverage needed for your team
16 | - Use an on-call scheduling tool to assign shifts
17 | - Provide clear escalation paths and support resources
18 |
19 | ## Best Practices
20 |
21 | - Rotate on-call shifts fairly among team members
22 | - Offer time off or compensation after heavy on-call periods
--------------------------------------------------------------------------------
/src/oncall-engineer.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An on-call engineer is a team member assigned to respond to incidents and alerts outside regular working hours.
3 | term: On-Call Engineer
4 | ---
5 | ## Who Is An On-Call Engineer
6 |
7 | An on-call engineer is a team member assigned to respond to incidents and alerts outside regular working hours. This role rotates among team members to provide round-the-clock coverage.
8 |
9 | ## Why Is An On-Call Engineer Important
10 |
11 | On-call engineers help resolve incidents quickly, reducing downtime and minimizing business impact when issues occur after hours.
12 |
13 | ## Example Of An On-Call Engineer
14 |
15 | A cloud provider rotates on-call duty weekly. The assigned engineer receives alerts and handles incidents during nights and weekends.
--------------------------------------------------------------------------------
/src/oncall-load-distribution.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: On-call load distribution is the practice of spreading incident response duties evenly across team members.
3 | term: On-Call Load Distribution
4 | ---
5 | ## What Is On-Call Load Distribution
6 |
7 | On-call load distribution is the practice of spreading incident response duties evenly across team members. This prevents burnout and keeps incident management effective.
8 |
9 | ## Why Is On-Call Load Distribution Important
10 |
11 | Evenly distributed on-call duties help maintain team morale and reduce fatigue. It also makes sure that no single person is overloaded, which leads to faster and more reliable incident responses.
12 |
13 | ## How To Implement On-Call Load Distribution With Spike
14 |
15 | - Create an on-call schedule in Spike for your team
16 | - Add team members and set their shifts or rotation patterns
17 | - Assign escalation policies to route alerts to the right person on duty
18 | - Review and adjust schedules as needed to balance workloads
19 |
20 | Distribute on-call duties fairly and keep your team fresh—set up your first schedule with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/oncall-load.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: On-call load is the number of incidents, alerts, or pages an on-call engineer receives during their shift.
3 | term: On-Call Load
4 | ---
5 | ## What Is On-Call Load
6 |
7 | On-call load is the number of incidents, alerts, or pages an on-call engineer receives during their shift. It measures the workload and stress placed on the person covering incidents.
8 |
9 | ## Why Is Tracking On-Call Load Important
10 |
11 | Tracking on-call load helps teams spot burnout risks and balance workloads. It also helps improve incident response by distributing alerts more evenly.
12 |
13 | ## How To Measure And Manage On-Call Load
14 |
15 | - Track the number of alerts and incidents per on-call shift
16 | - Use reporting tools to analyze patterns over time
17 | - Adjust shift schedules or team size based on data
18 | - Set alert thresholds to reduce noise and false alarms
--------------------------------------------------------------------------------
/src/oncall-responder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An on-call responder is a designated IT professional responsible for acknowledging, investigating, and resolving incidents that occur during their assigned shift.
3 | term: On-Call Responder
4 | ---
5 | ## Who Is an On-Call Responder
6 |
7 | An on-call responder is a designated IT professional responsible for acknowledging, investigating, and resolving incidents that occur during their assigned shift. They serve as the first line of defense against service disruptions outside regular business hours.
8 |
9 | ## Why Is an On-Call Responder Important
10 |
11 | On-call responders minimize incident impact by providing rapid response when issues arise. Their expertise and quick action prevent minor issues from escalating into major outages, protecting business operations and customer experience even during off-hours.
12 |
13 | ## Example Of an On-Call Responder
14 |
15 | At midnight, an on-call responder receives an alert about increasing error rates in the payment processing system. They quickly identify a memory leak in the application, restart the affected services, and implement a temporary fix until a permanent solution can be deployed during business hours.
--------------------------------------------------------------------------------
/src/oncall.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: On-call is a rotation system where IT professionals remain available outside regular working hours to respond to incidents and alerts.
3 | term: On-Call
4 | ---
5 | ## What Is On-Call
6 |
7 | On-call is a rotation system where IT professionals remain available outside regular working hours to respond to incidents and alerts. During their on-call shift, these professionals must be ready to address system issues, outages, or emergencies that require immediate attention to maintain service reliability.
8 |
9 | ## Why Is On-Call Important
10 |
11 | On-call rotations provide continuous coverage for critical systems and services, preventing extended downtime and service disruptions. This 24/7 availability helps organizations meet SLAs, maintain customer trust, and minimize financial losses from unresolved incidents.
12 |
13 | ## Example Of On-Call
14 |
15 | A DevOps engineer receives an alert at 2 AM about a database server crash. Being on-call, she acknowledges the alert through her incident management tool, investigates the issue remotely, and restores service within 30 minutes by restarting the database and fixing the configuration error.
--------------------------------------------------------------------------------
/src/open-telemetry.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: OpenTelemetry is an open-source observability framework that provides standardized tools, APIs, and SDKs for collecting and exporting metrics, logs, and traces from applications and infrastructure.
3 | term: OpenTelemetry
4 | ---
5 | ## What Is OpenTelemetry
6 |
7 | OpenTelemetry is an open-source observability framework that provides standardized tools, APIs, and SDKs for collecting and exporting metrics, logs, and traces from applications and infrastructure. It helps create consistent observability data across diverse technology stacks.
8 |
9 | ## Why Is OpenTelemetry Important
10 |
11 | OpenTelemetry reduces vendor lock-in by providing a standard format for observability data. It simplifies instrumentation across heterogeneous environments and enables teams to switch between observability tools without changing their code. This flexibility improves incident detection and analysis capabilities.
12 |
13 | ## Example Of OpenTelemetry
14 |
15 | A company uses OpenTelemetry to instrument their microservices architecture. When an incident occurs, they can trace requests across multiple services, regardless of programming language or infrastructure, pinpointing exactly where failures happen.
--------------------------------------------------------------------------------
/src/operational-maturity-om.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Operational Maturity (OM) is a framework that measures how advanced an organization's operational practices and processes are in terms of effectiveness, efficiency, and consistency.
3 | term: Operational Maturity (OM)
4 | ---
5 | ## What Is Operational Maturity (OM)
6 |
7 | Operational Maturity (OM) is a framework that measures how advanced an organization's operational practices and processes are in terms of effectiveness, efficiency, and consistency. In incident management, it reflects how well a team can predict, prevent, detect, respond to, and learn from incidents.
8 |
9 | ## Why Is Operational Maturity Important
10 |
11 | Higher operational maturity leads to fewer incidents, faster resolution times, and better service quality. It helps organizations move from reactive firefighting to proactive incident prevention. Teams with high OM experience less stress, lower costs, and improved customer satisfaction.
12 |
13 | ## Example Of Operational Maturity
14 |
15 | A company progresses from manual incident detection and ad-hoc responses (low maturity) to automated monitoring, standardized playbooks, and regular process improvements based on incident data analysis (high maturity).
--------------------------------------------------------------------------------
/src/operational-readiness.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Operational Readiness is the state of preparedness that allows an organization to effectively respond to and manage incidents when they occur.
3 | term: Operational Readiness
4 | ---
5 | ## What Is Operational Readiness
6 |
7 | Operational Readiness is the state of preparedness that allows an organization to effectively respond to and manage incidents when they occur. It encompasses having the right tools, processes, trained personnel, and resources in place to handle various incident scenarios.
8 |
9 | ## Why Is Operational Readiness Important
10 |
11 | Being operationally ready minimizes the impact of incidents by reducing response time and improving resolution effectiveness. It helps organizations maintain service continuity during disruptions and builds confidence among stakeholders in the team's ability to handle crises.
12 |
13 | ## Example Of Operational Readiness
14 |
15 | Before launching a major new feature, a software company conducts tabletop exercises simulating different failure scenarios. They verify that monitoring is in place, on-call schedules are set, and response playbooks are updated for potential incidents.
--------------------------------------------------------------------------------
/src/operational-resilience.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Operational Resilience is an organization's ability to continue delivering critical services despite disruptive incidents.
3 | term: Operational Resilience
4 | ---
5 | ## What Is Operational Resilience
6 |
7 | Operational Resilience is an organization's ability to continue delivering critical services despite disruptive incidents. It combines risk management, business continuity, disaster recovery, and incident response to create systems that can absorb shocks, adapt to changing conditions, and recover quickly.
8 |
9 | ## Why Is Operational Resilience Important
10 |
11 | Resilient operations maintain service availability during disruptions, protecting revenue and reputation. They reduce the business impact of incidents by limiting their scope and duration. Resilience also helps organizations meet regulatory requirements and customer expectations for reliable service.
12 |
13 | ## Example Of Operational Resilience
14 |
15 | A financial services company experiences a data center outage but continues processing transactions by automatically failing over to a backup site. Their incident response team manages the situation while business operations continue with minimal customer impact.
--------------------------------------------------------------------------------
/src/operations-lead.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An Operations Lead is the individual responsible for overseeing daily IT operations, coordinating operational activities, and ensuring service reliability.
3 | term: Operations Lead
4 | ---
5 | ## Who Is Operations Lead
6 |
7 | An Operations Lead is the individual responsible for overseeing daily IT operations, coordinating operational activities, and ensuring service reliability. They manage operational teams, drive process improvements, and serve as the primary point of escalation for significant incidents.
8 |
9 | ## Why Is Operations Lead Important
10 |
11 | The Operations Lead provides direction and accountability for operational excellence. They bridge the gap between technical teams and management, drive operational improvements, and ensure that incident response aligns with business priorities.
12 |
13 | ## Example Of Operations Lead
14 |
15 | During a major service outage, the Operations Lead coordinates the response across multiple teams, communicates with executives about business impact, makes critical decisions about service restoration priorities, and ensures proper resources are allocated to resolve the incident.
--------------------------------------------------------------------------------
/src/outage.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: An outage is an unplanned interruption or loss of service in a system, network, application, or infrastructure component that prevents users from accessing resources or performing normal operations.
3 | term: Outage
4 | ---
5 | ## What Is Outage
6 |
7 | An outage is an unplanned interruption or loss of service in a system, network, application, or infrastructure component that prevents users from accessing resources or performing normal operations. Outages can range from brief disruptions affecting limited functionality to complete system failures impacting entire organizations.
8 |
9 | ## Example Of Outage
10 |
11 | A cloud service provider experiences a network failure that takes down their authentication system. As a result, thousands of customers cannot log into their accounts or access critical services for several hours, leading to business disruptions and financial losses.
--------------------------------------------------------------------------------
/src/p0-priority-zero.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: P0 is the highest incident priority level, representing critical incidents that cause a complete service outage or pose severe security threats.
3 | term: P0 (Priority Zero)
4 | ---
5 | ## What Is P0 (Priority Zero)
6 |
7 | P0 is the highest incident priority level, representing critical incidents that cause a complete service outage or pose severe security threats. These incidents demand immediate attention and often require an all-hands response from the organization.
8 |
9 | ## Why Is P0 Important
10 |
11 | P0 incidents have the most severe business impact and require rapid response to minimize damage. They often affect multiple systems or all users of a service and can lead to significant financial losses or reputation damage if not addressed quickly.
12 |
13 | ## Example Of P0
14 |
15 | A complete database failure that renders the entire application unusable for all customers would be classified as P0. Another example is a security breach that exposes sensitive customer data and requires immediate containment.
--------------------------------------------------------------------------------
/src/p1-priority-one.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: P1 is the second-highest incident priority level, representing serious incidents that cause significant service degradation or affect a large portion of users.
3 | term: P1 (Priority One)
4 | ---
5 | ## What Is P1 (Priority One)
6 |
7 | P1 is the second-highest incident priority level, representing serious incidents that cause significant service degradation or affect a large portion of users. These incidents require urgent attention but may not demand the full-scale response of a P0.
8 |
9 | ## Why Is P1 Important
10 |
11 | P1 incidents significantly impact business operations and user experience. They require prompt resolution to prevent escalation to P0 status and to maintain service reliability and customer satisfaction.
12 |
13 | ## Example Of P1
14 |
15 | A payment processing system experiencing intermittent failures during checkout would be a P1 incident. It doesn't completely prevent all transactions but creates a poor experience for many customers and impacts revenue.
--------------------------------------------------------------------------------
/src/p2-priority-two.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: P2 is a moderate priority level for incidents that cause limited service disruption or affect a smaller subset of users.
3 | term: P2 (Priority Two)
4 | ---
5 | ## What Is P2 (Priority Two)
6 |
7 | P2 is a moderate priority level for incidents that cause limited service disruption or affect a smaller subset of users. These incidents require attention within hours rather than minutes but still need to be addressed promptly.
8 |
9 | ## Why Is P2 Important
10 |
11 | P2 incidents can degrade user experience and potentially escalate if left unaddressed. They serve as early warnings of potential larger issues and provide opportunities to fix problems before they affect more users.
12 |
13 | ## Example Of P2
14 |
15 | A feature that's unavailable for a specific user segment or a performance slowdown during non-peak hours would typically be classified as P2. These issues impact operations but don't prevent core business functions.
--------------------------------------------------------------------------------
/src/p3-priority-three.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: P3 is a low-priority incident level that represents minor issues with limited impact on users or business operations.
3 | term: P3 (Priority Three)
4 | ---
5 | ## What Is P3 (Priority Three)
6 |
7 | P3 is a low-priority incident level that represents minor issues with limited impact on users or business operations. These incidents may cause slight inconvenience but don't significantly affect core functionality or user experience.
8 |
9 | ## Why Is P3 Important
10 |
11 | P3 incidents help teams identify and fix minor issues before they potentially grow into larger problems. They also provide valuable data about system behavior and can reveal opportunities for improvement in less critical areas.
12 |
13 | ## Example Of P3
14 |
15 | A cosmetic UI issue, a minor delay in non-critical report generation, or a feature that's working but not optimally would be classified as P3. These issues are noticeable but don't prevent users from accomplishing their tasks.
--------------------------------------------------------------------------------
/src/p4-priority-four.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: P4 is the lowest incident priority level, representing trivial issues that have minimal or no impact on users or business operations.
3 | term: P4 (Priority Four)
4 | ---
5 | ## What Is P4 (Priority Four)
6 |
7 | P4 is the lowest incident priority level, representing trivial issues that have minimal or no impact on users or business operations. These are often cosmetic issues, minor bugs in non-critical features, or suggestions for improvement.
8 |
9 | ## Why Is P4 Important
10 |
11 | P4 incidents help maintain overall system quality and user satisfaction over time. While not urgent, addressing these issues demonstrates attention to detail and commitment to continuous improvement.
12 |
13 | ## Example Of P4
14 |
15 | A typo in documentation, a slight color mismatch in the UI, or a feature enhancement request would typically be classified as P4. These issues don't affect functionality but may impact the polish or user experience of a product.
--------------------------------------------------------------------------------
/src/phone-call-notifications.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Phone Call Notifications are automated voice calls sent to on-call responders when critical incidents occur.
3 | term: Phone Call Notifications
4 | ---
5 | ## What Are Phone Call Notifications
6 |
7 | Phone Call Notifications are automated voice calls sent to on-call responders when critical incidents occur. They typically include information about the incident and may require acknowledgment from the recipient.
8 |
9 | ## Why Are Phone Call Notifications Important
10 |
11 | Phone Call Notifications break through notification fatigue when urgent action is needed. They're more likely to wake sleeping team members during off-hours incidents and provide a reliable backup when other notification methods fail.
12 |
13 | ## Example Of Phone Call Notifications
14 |
15 | When a payment processing system fails at 2 AM, the incident management platform automatically calls the on-call engineer. The call includes a brief description of the incident and options to acknowledge or escalate the issue.
16 |
17 | ## How To Implement Phone Call Notifications With Spike
18 |
19 | - Go to your team settings in Spike and add phone numbers for your on-call responders
20 | - Choose “Phone Call” as a notification method in your on-call schedules or escalation policies
21 | - Spike calls team members during critical incidents and asks for acknowledgment
22 |
23 | Set up phone call notifications in [Spike](https://app.spike.sh/signup) and never miss a critical alert, day or night.
--------------------------------------------------------------------------------
/src/platform-engineering.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Platform engineering is the discipline of designing and building internal developer platforms that enable software delivery and operations teams to self-service their infrastructure needs.
3 | term: Platform Engineering
4 | ---
5 | ## What Is Platform Engineering
6 |
7 | Platform engineering is the discipline of designing and building internal developer platforms that enable software delivery and operations teams to self-service their infrastructure needs. In incident management, it focuses on creating standardized, reliable systems that minimize incidents and streamline response.
8 |
9 | ## Why Is Platform Engineering Important
10 |
11 | Platform engineering reduces incident frequency by standardizing infrastructure and eliminating error-prone manual processes. It accelerates incident response by providing consistent tooling and environments, while improving visibility across complex systems.
12 |
13 | ## Example Of Platform Engineering
14 |
15 | A retail company's platform team builds a self-service portal where development teams can provision pre-hardened, security-compliant environments. This reduces configuration errors that previously caused 40% of production incidents.
--------------------------------------------------------------------------------
/src/platform-integration.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Platform Integration in incident management refers to connecting your incident response tools with other systems like monitoring, ticketing, communication, and development platforms.
3 | term: Platform Integration
4 | ---
5 | ## What Is Platform Integration
6 |
7 | Platform Integration in incident management refers to connecting your incident response tools with other systems like monitoring, ticketing, communication, and development platforms. This creates a unified workflow across the incident lifecycle.
8 |
9 | ## Why Is Platform Integration Important
10 |
11 | Platform Integration eliminates manual data transfer between systems. It speeds up incident response by automating workflows, reduces context switching for responders, and creates a more complete view of incidents across different tools.
12 |
13 | ## Example Of Platform Integration
14 |
15 | An incident management system integrates with Slack, PagerDuty, and Jira. When an alert triggers, it automatically creates a Slack channel, notifies on-call staff through PagerDuty, and generates a Jira ticket—all without manual intervention.
--------------------------------------------------------------------------------
/src/postmortem-templates.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Postmortem Templates are standardized documents or forms used to analyze incidents after they've been resolved.
3 | term: Postmortem Templates
4 | ---
5 | ## What Are Postmortem Templates
6 |
7 | Postmortem Templates are standardized documents or forms used to analyze incidents after they've been resolved. They guide teams through a structured review process to document what happened, why it happened, and how to prevent similar incidents in the future.
8 |
9 | ## Why Are Postmortem Templates Important
10 |
11 | Postmortem Templates create consistency in incident reviews across teams and incidents. They save time during the analysis process, help capture all relevant information, and make it easier to track action items and improvements over time.
12 |
13 | ## Example Of Postmortem Templates
14 |
15 | A cloud service provider uses a template that includes sections for incident timeline, root cause analysis, customer impact assessment, and action items. This template is filled out after every major outage and shared with all engineering teams.
16 |
17 | ## How To Create Postmortem Templates
18 |
19 | - Create a basic template with key sections for incident analysis
20 | - Include fields for timeline, impact, root cause, and action items
21 | - Make the template accessible to all relevant teams
22 | - Review and update the template based on team feedback
23 | - Store completed postmortems in a searchable repository
--------------------------------------------------------------------------------
/src/predictable-pricing.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Predictable pricing is a transparent billing model for incident management tools where costs remain consistent and foreseeable regardless of usage fluctuations.
3 | term: Predictable Pricing
4 | ---
5 | ## What Is Predictable Pricing
6 |
7 | Predictable pricing is a transparent billing model for incident management tools where costs remain consistent and foreseeable regardless of usage fluctuations. It typically involves fixed monthly or annual fees with clear limits on features, users, or alerts, allowing organizations to budget accurately without fear of unexpected charges.
8 |
9 | ## Why Is Predictable Pricing Important
10 |
11 | Predictable pricing helps organizations plan their incident management budgets with confidence. It eliminates surprise charges during major incidents when alert volumes spike. Teams can focus on resolving issues rather than worrying about mounting costs during critical situations.
12 |
13 | ## Example Of Predictable Pricing
14 |
15 | A company pays $500 monthly for their incident management platform regardless of whether they handle 10 or 1,000 incidents. During a major outage, they receive hundreds of alerts but face no additional charges, unlike competitors using per-alert pricing models.
--------------------------------------------------------------------------------
/src/primary-responder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A primary responder is the first person or team assigned to handle an incident as soon as it is reported.
3 | term: Primary Responder
4 | ---
5 | ## Who Is Primary Responder
6 |
7 | A primary responder is the first person or team assigned to handle an incident as soon as it is reported. This role involves taking immediate action to assess the situation, start the resolution, and communicate with stakeholders.
8 |
9 | ## Why Is Primary Responder Important
10 |
11 | The primary responder sets the pace for incident resolution. Quick and effective action in the early moments can reduce downtime, limit impact, and keep everyone informed.
12 |
13 | ## Example Of Primary Responder
14 |
15 | When a server goes down, the on-call engineer receives an alert and acts as the primary responder. They investigate the issue, update the team, and start recovery steps.
--------------------------------------------------------------------------------
/src/priority.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Priority in incident management is the assigned level of urgency and importance given to an incident based on its impact on business operations and customers.
3 | term: Priority
4 | ---
5 | ## What Is Priority
6 |
7 | Priority in incident management is the assigned level of urgency and importance given to an incident based on its impact on business operations and customers. It determines the order in which incidents are addressed and the resources allocated to their resolution.
8 |
9 | ## Example Of Priority
10 |
11 | A payment processing system failure affecting thousands of customers receives P0 (highest) priority, triggering immediate response from senior engineers. Meanwhile, a minor UI glitch affecting a few internal users is assigned P4 priority for later resolution.
--------------------------------------------------------------------------------
/src/production-environment.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Production Environment is the live system where applications and services run to deliver functionality to end-users.
3 | term: Production Environment
4 | ---
5 | ## What Is Production Environment
6 |
7 | A Production Environment is the live system where applications and services run to deliver functionality to end-users. It contains the actual data and infrastructure that support business operations and is distinguished from development, testing, and staging environments by its direct impact on users and business processes.
8 |
9 | ## Why Is Production Environment Important
10 |
11 | The Production Environment directly affects user experience, business operations, and revenue generation. Any incidents in this environment have immediate consequences on service availability and reliability. Proper management of the production environment is critical for maintaining business continuity and customer trust.
12 |
13 | ## Example Of Production Environment
14 |
15 | An e-commerce company's production environment includes web servers handling customer traffic, payment processing systems, inventory databases, and order fulfillment applications. When a database server in this environment slows down, it immediately impacts customers' ability to complete purchases.
--------------------------------------------------------------------------------
/src/quantum-computing-security-incidents.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Quantum computing security incidents are breaches or vulnerabilities that emerge from quantum computing technologies or target quantum systems.
3 | term: Quantum Computing Security Incidents
4 | ---
5 | ## What Are Quantum Computing Security Incidents
6 |
7 | Quantum computing security incidents are breaches or vulnerabilities that emerge from quantum computing technologies or target quantum systems. These incidents involve attacks that exploit quantum algorithms to break traditional encryption or compromise quantum communication channels, posing unique challenges for incident response teams.
8 |
9 | ## Example Of Quantum Computing Security Incidents
10 |
11 | A theoretical quantum computing incident might involve an attacker using Shor's algorithm on a sufficiently powerful quantum computer to factor the large composite numbers that RSA keys are built on, breaking RSA encryption. This could compromise encrypted communications across an organization's network before traditional monitoring systems detect the breach.
--------------------------------------------------------------------------------
/src/query-builder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Query Builder is a tool that allows users to create custom searches and filters for incident data without needing to know complex query languages.
3 | term: Query Builder
4 | ---
5 | ## What Is Query Builder
6 |
7 | Query Builder is a tool that allows users to create custom searches and filters for incident data without needing to know complex query languages. It provides a visual interface to construct searches across incident records, helping teams find relevant information quickly during incident management.
8 |
9 | ## Why Is Query Builder Important
10 |
11 | Query Builder helps teams quickly find historical incidents similar to current ones, potentially revealing solutions. It supports post-incident analysis by making it easy to identify patterns across incidents. Teams can create custom views and reports without relying on technical specialists.
12 |
13 | ## Example Of Query Builder
14 |
15 | A team lead needs to find all database-related incidents that occurred during peak hours in the last month. Using Query Builder, they select "Database" from the category dropdown, set a time-of-day filter for peak hours, and add a date range for the past 30 days—all without writing SQL.
--------------------------------------------------------------------------------
/src/queue.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A queue in incident management is an organized list of incidents waiting to be addressed by support teams.
3 | term: Queue
4 | ---
5 | ## What Is Queue
6 |
7 | A queue in incident management is an organized list of incidents waiting to be addressed by support teams. It serves as a central repository where incidents are stored, prioritized, and assigned based on factors like severity, impact, and available resources.
8 |
9 | ## Why Is Queue Important
10 |
11 | Queues create order in what could otherwise be chaos during multiple simultaneous incidents. They help teams prioritize their work based on business impact rather than just responding to the newest or loudest issues. Proper queue management prevents critical incidents from being overlooked.
--------------------------------------------------------------------------------
/src/real-time-collaboration-tools.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Real-time collaboration tools are software platforms that allow incident response teams to work together simultaneously during an incident.
3 | term: Real-time Collaboration Tools
4 | ---
5 | ## What Are Real-time Collaboration Tools
6 |
7 | Real-time collaboration tools are software platforms that allow incident response teams to work together simultaneously during an incident. These tools provide shared workspaces where team members can communicate, share information, and coordinate actions regardless of their physical location.
8 |
9 | ## Why Are Real-time Collaboration Tools Important
10 |
11 | During incidents, quick and clear communication is crucial. Real-time collaboration tools eliminate delays, create a single source of truth, and help teams coordinate complex responses. They also preserve a record of the incident response for later analysis and learning.
12 |
13 | ## Example Of Real-time Collaboration Tools
14 |
15 | During a major service outage, an incident response team uses a dedicated Slack channel to share updates, a collaborative document to track investigation steps, and a video conferencing tool for their virtual war room. This combination allows them to work together effectively despite being distributed across different locations.
--------------------------------------------------------------------------------
/src/recovery-point-objective-rpo.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Recovery Point Objective (RPO) is the maximum acceptable amount of data loss measured in time.
3 | term: Recovery Point Objective (RPO)
4 | ---
5 | ## What Is Recovery Point Objective (RPO)
6 |
7 | Recovery Point Objective (RPO) is the maximum acceptable amount of data loss measured in time. It defines how much data an organization can afford to lose during an incident before significant harm occurs to the business.
8 |
9 | ## Why Is Recovery Point Objective (RPO) Important
10 |
11 | RPO helps organizations design appropriate backup strategies and allocate resources effectively. It balances the cost of data protection against business risk, allowing teams to implement solutions that meet specific data loss tolerance requirements.
12 |
13 | ## Example Of Recovery Point Objective (RPO)
14 |
15 | A financial services company sets an RPO of 5 minutes for its transaction processing system. This means they implement continuous data replication to ensure that, in the event of a failure, no more than 5 minutes of transaction data would be lost.
--------------------------------------------------------------------------------
/src/recovery-time-objective-rto.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Recovery Time Objective (RTO) is the maximum acceptable time it should take to restore a system after an incident.
3 | term: Recovery Time Objective (RTO)
4 | ---
5 | ## What Is Recovery Time Objective (RTO)
6 |
7 | Recovery Time Objective (RTO) is the maximum acceptable time it should take to restore a system after an incident. It defines how quickly a business process must be restored to avoid unacceptable consequences from a break in continuity.
8 |
9 | ## Why Is Recovery Time Objective (RTO) Important
10 |
11 | RTO guides investment in recovery capabilities and helps set realistic expectations with stakeholders. It ensures that recovery strategies align with business needs and helps incident teams prioritize their efforts during restoration activities.
12 |
13 | ## Example Of Recovery Time Objective (RTO)
14 |
15 | An e-commerce platform sets an RTO of 15 minutes for its payment processing system. This drives their investment in redundant payment infrastructure, automated failover mechanisms, and dedicated recovery teams to meet this tight timeline.
--------------------------------------------------------------------------------
/src/resilience-engineering.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Resilience engineering is an approach to incident management that focuses on building systems that can withstand, adapt to, and recover from failures.
3 | term: Resilience Engineering
4 | ---
5 | ## What Is Resilience Engineering
6 |
7 | Resilience engineering is an approach to incident management that focuses on building systems that can withstand, adapt to, and recover from failures. Rather than just preventing failures, it acknowledges that failures will occur and designs systems to be robust enough to continue functioning despite problems.
8 |
9 | ## Why Is Resilience Engineering Important
10 |
11 | Resilience engineering helps organizations maintain critical services even during incidents. It shifts focus from blame to learning, improves system reliability, and reduces the business impact of failures. This approach is especially valuable in complex, interconnected systems where not all failures can be predicted.
12 |
13 | ## Example Of Resilience Engineering
14 |
15 | A payment processor implements circuit breakers in their microservices architecture. When one service begins to fail, the circuit breaker prevents cascading failures by gracefully degrading non-essential features while maintaining core payment functionality.
--------------------------------------------------------------------------------
/src/resilience.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Resilience in incident management is the ability of an IT system or organization to withstand, adapt to, and rapidly recover from disruptions while maintaining continuous business operations.
3 | term: Resilience
4 | ---
5 | ## What Is Resilience In Incident Management
6 |
7 | Resilience in incident management is the ability of an IT system or organization to withstand, adapt to, and rapidly recover from disruptions while maintaining continuous business operations. It involves designing systems that can absorb impacts, maintain core functionality during incidents, and return to normal operations quickly.
8 |
9 | ## Why Is Resilience Important In Incident Management
10 |
11 | Resilience reduces business impact during incidents and helps maintain service continuity. It builds customer trust by minimizing downtime and preserving critical functions even when problems occur. Organizations with resilient systems face fewer catastrophic failures and recover faster when incidents do happen.
--------------------------------------------------------------------------------
/src/resolution-time.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Resolution time is the total duration from when an incident is first detected until it is fully resolved and normal service is restored.
3 | term: Resolution Time
4 | ---
5 | ## What Is Resolution Time
6 |
7 | Resolution time is the total duration from when an incident is first detected until it is fully resolved and normal service is restored. It measures the complete lifecycle of an incident and reflects the efficiency of an organization's incident management process.
8 |
9 | ## Why Is Resolution Time Important
10 |
11 | Resolution time directly impacts business operations and customer satisfaction. Shorter resolution times minimize service disruptions and associated costs. This metric helps identify bottlenecks in incident management processes and provides a benchmark for measuring improvement.
12 |
13 | ## Example Of Resolution Time
14 |
15 | A critical database failure is detected at 2:00 PM. After diagnosis and implementation of a fix, normal operations resume at 4:30 PM. The resolution time is 2.5 hours, which can be compared against SLA targets and historical performance.
--------------------------------------------------------------------------------
/src/response-time.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Response time in incident management is the duration between incident detection and the beginning of remediation efforts.
3 | term: Response Time
4 | ---
5 | ## What Is Response Time
6 |
7 | Response time in incident management is the duration between incident detection and the beginning of remediation efforts. It measures how quickly a team acknowledges and starts working on an incident after it's been identified or reported.
8 |
9 | ## Why Is Response Time Important
10 |
11 | Fast response times limit incident impact and prevent problems from escalating. Quick responses demonstrate reliability to customers and stakeholders. This metric helps identify staffing gaps and process inefficiencies in the initial incident handling phase.
12 |
13 | ## Example Of Response Time
14 |
15 | A monitoring system detects a website outage at 3:15 AM. The on-call engineer receives the alert, acknowledges it, and begins troubleshooting by 3:22 AM. The response time is 7 minutes, which falls within the organization's target of 15 minutes for critical incidents.
--------------------------------------------------------------------------------
/src/root-cause.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Root cause is the fundamental, underlying reason for an incident or problem.
3 | term: Root Cause
4 | ---
5 | ## What Is Root Cause
6 |
7 | Root cause is the fundamental, underlying reason for an incident or problem. It represents the core issue that, when addressed, prevents similar incidents from recurring. Root cause goes beyond symptoms to identify the original source of the failure.
8 |
9 | ## Why Is Root Cause Important
10 |
11 | Identifying the root cause prevents treating only symptoms, which leads to recurring incidents. It enables permanent solutions rather than temporary fixes. Understanding root causes also helps organizations improve their systems and processes over time.
12 |
13 | ## Example Of Root Cause
14 |
15 | A website experiences repeated outages. Initial investigation shows high server load (symptom). The root cause is identified as inefficient database queries that were introduced by a recent code change and weren't caught in testing.
--------------------------------------------------------------------------------
/src/runbook.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A runbook is a standardized document that contains step-by-step procedures for responding to specific incidents or performing routine operations.
3 | term: Runbook
4 | ---
5 | ## What Is Runbook
6 |
7 | A runbook is a standardized document that contains step-by-step procedures for responding to specific incidents or performing routine operations. It provides clear instructions that guide responders through the process of diagnosing and resolving incidents efficiently.
8 |
9 | ## Why Is Runbook Important
10 |
11 | Runbooks reduce response time and human error during incidents. They capture institutional knowledge, enable consistent responses regardless of who's on call, and help new team members respond effectively. Well-designed runbooks lead to faster incident resolution.
12 |
13 | ## Example Of Runbook
14 |
15 | A database failure runbook includes steps to verify the outage, check for recent changes, review error logs, restart services in the correct order, verify recovery, and communicate with stakeholders throughout the process.
16 |
17 | ## How To Create Runbook With Spike
18 |
19 | - Open the Incidents or Runbooks page in Spike and click "Create Runbook"
20 | - Add a title, summary, and step-by-step instructions in plain language
21 | - Save your runbook to make it instantly available during incidents
22 | - Link the runbook to specific alerts so your team can find it fast
23 |
24 | Make incident response easier—start building your first runbook with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/secondary-responder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A secondary responder is a backup team member who steps in if the primary on-call responder cannot address an incident.
3 | term: Secondary Responder
4 | ---
5 | ## What Is Secondary Responder
6 |
7 | A secondary responder is a backup team member who steps in if the primary on-call responder cannot address an incident. This role helps maintain incident response continuity.
8 |
9 | ## Why Is Secondary Responder Important
10 |
11 | Secondary responders prevent gaps in incident coverage. If the primary responder misses or cannot handle an alert, the secondary responder steps in to resolve the issue quickly.
12 |
13 | ## Example Of Secondary Responder
14 |
15 | If the primary on-call engineer does not acknowledge a critical alert within 10 minutes, the alert escalates to the secondary responder.
--------------------------------------------------------------------------------
/src/security-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A security incident is an event that violates security policies, compromises data integrity, or threatens system confidentiality or availability.
3 | term: Security Incident
4 | ---
5 | ## What Is Security Incident
6 |
7 | A security incident is an event that violates security policies, compromises data integrity, or threatens system confidentiality or availability. It includes unauthorized access attempts, data breaches, malware infections, and other events that pose security risks to an organization.
8 |
9 | ## Example Of Security Incident
10 |
11 | A company detects unusual login patterns indicating a brute force attack against their customer database. The security team blocks the suspicious IP addresses, forces password resets for affected accounts, and investigates the scope of the potential breach.
12 |
13 | ## How To Implement Security Incident Response
14 |
15 | - Create a dedicated security incident response plan
16 | - Form a specialized security incident response team
17 | - Deploy security monitoring and detection tools
18 | - Establish clear escalation paths for different types of security incidents
19 | - Develop containment, eradication, and recovery procedures
20 | - Prepare communication templates for different security scenarios
21 |
22 | ## Best Practices
23 |
24 | - Train all staff on security awareness and incident reporting procedures
25 | - Conduct regular security incident simulations to test response readiness
26 | - Maintain relationships with external security resources and law enforcement
--------------------------------------------------------------------------------
/src/self-healing-incident.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A self-healing incident is an issue that is detected and resolved automatically by systems without human intervention.
3 | term: Self-Healing Incident
4 | ---
5 | ## What Is Self-Healing Incident
6 |
7 | A self-healing incident is an issue that is detected and resolved automatically by systems without human intervention. Automated scripts or tools identify the problem and trigger corrective actions.
8 |
9 | ## Example Of Self-Healing Incident
10 |
11 | A monitoring tool detects high memory usage on a server and automatically restarts the affected service, fixing the issue before users notice.
--------------------------------------------------------------------------------
/src/service-degradation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Service degradation occurs when a system continues to function but with reduced performance, reliability, or capabilities.
3 | term: Service Degradation
4 | ---
5 | ## What Is Service Degradation
6 |
7 | Service degradation occurs when a system continues to function but with reduced performance, reliability, or capabilities. Unlike a complete outage, degraded services operate below normal quality levels, creating a suboptimal user experience while still providing core functionality.
8 |
9 | ## Example Of Service Degradation
10 |
11 | An e-commerce website experiences slow page load times during a flash sale. The site remains accessible, but images load slowly, search functions time out occasionally, and checkout takes longer than normal. The degraded performance impacts sales but doesn't completely halt operations.
--------------------------------------------------------------------------------
/src/service-level-indicator-sli.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Service Level Indicator (SLI) is a specific metric used to measure the performance of a service.
3 | term: Service Level Indicator (SLI)
4 | ---
5 | ## What Is Service Level Indicator (SLI)
6 |
7 | A Service Level Indicator (SLI) is a specific metric used to measure the performance of a service. In incident management, SLIs often include metrics like incident response time, resolution time, or system uptime percentage.
8 |
9 | ## Why Is Service Level Indicator (SLI) Important
10 |
11 | SLIs provide concrete, measurable data points to assess service quality and performance. They help teams track progress, identify areas for improvement, and demonstrate compliance with SLAs. SLIs form the foundation for setting and evaluating Service Level Objectives (SLOs).
12 |
13 | ## Example Of Service Level Indicator (SLI)
14 |
15 | An SLI for an incident management system might be "percentage of high-severity incidents acknowledged within 15 minutes." This metric directly measures a specific aspect of the incident response process.
--------------------------------------------------------------------------------
/src/service-level-objective-slo.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Service Level Objective (SLO) is a target value or range for a service level that is measured by a Service Level Indicator (SLI).
3 | term: Service Level Objective (SLO)
4 | ---
5 | ## What Is Service Level Objective (SLO)
6 |
7 | A Service Level Objective (SLO) is a target value or range for a service level that is measured by a Service Level Indicator (SLI). In incident management, SLOs define the goals for incident response and resolution performance.
8 |
9 | ## Why Is Service Level Objective (SLO) Important
10 |
11 | SLOs translate SLIs into concrete performance targets. They help teams set realistic goals, prioritize improvements, and balance reliability with innovation. SLOs provide a clear benchmark for assessing service quality and guiding incident management strategies.
12 |
13 | ## Example Of Service Level Objective (SLO)
14 |
15 | An SLO for incident response might be "99% of high-severity incidents will be acknowledged within 15 minutes." This sets a specific, measurable target for the incident management team.
--------------------------------------------------------------------------------
/src/service-owner.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A service owner is the individual responsible for the overall health, performance, and business alignment of a specific service.
3 | term: Service Owner
4 | ---
5 | ## What Is Service Owner
6 |
7 | A service owner is the individual responsible for the overall health, performance, and business alignment of a specific service. They oversee the service throughout its lifecycle, coordinate across teams during incidents, and drive continuous improvement initiatives.
8 |
9 | ## Importance Of Service Owner
10 |
11 | Service owners provide clear accountability for service quality and incident response. They bridge the gap between technical teams and business stakeholders, ensuring services meet business needs and that incidents are handled with appropriate priority and resources.
12 |
13 | ## Example Of Service Owner
14 |
15 | The payment processing service owner at an e-commerce company coordinates between development, operations, and business teams during a payment outage. They communicate impact to executives, make critical decisions about mitigation strategies, and follow up with improvements after resolution.
--------------------------------------------------------------------------------
/src/service-restoration.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Service Restoration is the process of returning affected systems to normal operation after an incident.
3 | term: Service Restoration
4 | ---
5 | ## What Is Service Restoration
6 |
7 | Service Restoration is the process of returning affected systems to normal operation after an incident. It focuses on minimizing downtime by quickly restoring functionality, even if temporary measures are needed while permanent fixes are developed.
8 |
9 | ## Why Is Service Restoration Important
10 |
11 | Service Restoration directly impacts user experience and business continuity. Quick restoration minimizes financial and reputational damage from outages. It separates the immediate need to restore service from the longer process of permanent resolution, allowing businesses to recover faster.
12 |
13 | ## Example Of Service Restoration
14 |
15 | A web application experiences database connection failures. The incident team restores service by implementing a connection pooling solution and adding more database replicas. This restores functionality while they work on the underlying connection management issue.
16 |
17 | ## How To Implement Service Restoration
18 |
19 | - Develop restoration procedures for critical services in advance
20 | - Create a decision framework for choosing between restoration options
21 | - Maintain backup systems and redundant components
22 | - Practice restoration procedures regularly
23 | - Document temporary fixes applied during restoration
24 | - Verify service functionality after restoration
--------------------------------------------------------------------------------
/src/service.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A service in incident management refers to any application, system, or infrastructure component that delivers value to users.
3 | term: Service
4 | ---
5 | ## What Is Service
6 |
7 | A service in incident management refers to any application, system, or infrastructure component that delivers value to users. Services can include customer-facing applications, internal tools, APIs, databases, or network infrastructure that support business operations.
8 |
9 | ## Example Of Service
10 |
11 | An e-commerce company identifies their checkout process as a critical service. This service depends on payment processing, inventory management, and user authentication components. When an incident affects any of these components, the impact on the checkout service determines response priority.
--------------------------------------------------------------------------------
/src/severity.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Severity in incident management is a measure of the impact and urgency of an incident on business operations, services, or customers.
3 | term: Severity
4 | ---
5 | ## What Is Severity
6 |
7 | Severity in incident management is a measure of the impact and urgency of an incident on business operations, services, or customers. It helps prioritize incidents and determine the appropriate response level and resources needed for resolution.
8 |
9 | ## Example Of Severity
10 |
11 | A major e-commerce platform experiences a complete site outage during a peak shopping period. This would be classified as a high-severity incident due to its significant impact on revenue and customer experience.
--------------------------------------------------------------------------------
/src/shadow-on-call-rotation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Shadow on-call rotation lets new team members observe experienced engineers during on-call shifts without full responsibility.
3 | term: Shadow On-Call Rotation
4 | ---
5 | ## What Is Shadow On-Call Rotation
6 |
7 | Shadow on-call rotation lets new team members observe experienced engineers during on-call shifts without full responsibility. This helps them learn incident response in real situations.
8 |
9 | ## Why Is Shadow On-Call Rotation Important
10 |
11 | Shadow rotations give new staff real-world training and build confidence. They improve team readiness without risking incident response quality.
12 |
13 | ## How To Implement Shadow On-Call Rotation
14 |
15 | - Pair new hires with experienced on-call engineers
16 | - Let shadows join incident calls and discussions
17 | - Gradually increase their involvement as they learn
18 | - Collect feedback from both the shadow and the mentor
19 | - Move shadows to active rotation when ready
--------------------------------------------------------------------------------
/src/single-point-of-failure.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A single point of failure (SPOF) is any part of a system that, if it fails, will cause the entire system or service to stop working.
3 | term: Single Point Of Failure (SPOF)
4 | ---
5 | ## What Is Single Point Of Failure
6 |
7 | A single point of failure (SPOF) is any part of a system that, if it fails, will cause the entire system or service to stop working. In incident management, SPOFs are risks that can lead to major outages.
8 |
9 | ## Why Is Identifying Single Point Of Failure Important
10 |
11 | Identifying SPOFs helps teams improve reliability and reduce the risk of major incidents. Removing SPOFs makes systems more resilient to failures.
12 |
13 | ## Example Of Single Point Of Failure
14 |
15 | A company runs its website on a single server. If that server fails, the website goes down for all users.
16 |
17 | ## How To Implement Single Point Of Failure Analysis
18 |
19 | - Map out all system components and dependencies
20 | - Identify parts with no backup or redundancy
21 | - Prioritize fixing the most critical SPOFs
22 |
23 | ## Best Practices
24 |
25 | - Add redundancy for critical components
26 | - Regularly review systems for new SPOFs
27 | - Document all known SPOFs and mitigation plans
--------------------------------------------------------------------------------
/src/site-reliability-engineering-sre.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Site Reliability Engineering (SRE) is a discipline that incorporates aspects of software engineering and applies them to infrastructure and operations problems.
3 | term: Site Reliability Engineering (SRE)
4 | ---
5 | ## What Is Site Reliability Engineering (SRE)
6 |
7 | Site Reliability Engineering (SRE) is a discipline that incorporates aspects of software engineering and applies them to infrastructure and operations problems. SRE aims to create scalable and highly reliable software systems.
8 |
9 | ## Why Is Site Reliability Engineering (SRE) Important
10 |
11 | SRE bridges the gap between development and operations, promoting a proactive approach to system reliability. It helps reduce downtime, improve incident response, and balance the pace of innovation with the risk of failure. SRE practices lead to more resilient systems and efficient incident management.
--------------------------------------------------------------------------------
/src/site.manifest:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Incident Response Glossary by Spike.sh",
3 | "short_name": "Glossary",
4 | "description": "Explore 500+ key incident response terms and definitions curated by Spike.sh",
5 | "start_url": "/glossary",
6 | "display": "standalone",
7 | "background_color": "#ffffff",
8 | "theme_color": "#1649FF",
9 | "icons": [
10 | {
11 | "src": "https://cdn.spike.sh/logos/spike-badge-192.png",
12 | "sizes": "192x192",
13 | "type": "image/png"
14 | },
15 | {
16 | "src": "https://cdn.spike.sh/logos/spike-badge-512.png",
17 | "sizes": "512x512",
18 | "type": "image/png"
19 | }
20 | ]
21 | }
--------------------------------------------------------------------------------
/src/stakeholder.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A stakeholder in incident management is any individual, team, or entity affected by or having influence over an incident and its resolution.
3 | term: Stakeholder
4 | ---
5 | ## What Is Stakeholder
6 |
7 | A stakeholder in incident management is any individual, team, or entity affected by or having influence over an incident and its resolution. This includes technical teams, management, customers, and third-party vendors who have a vested interest in the incident's outcome.
8 |
9 | ## Why Is Stakeholder Important
10 |
11 | Stakeholders provide critical perspectives and resources during incidents. Their involvement helps prioritize response efforts, allocate resources effectively, and maintain clear communication channels. Proper stakeholder management prevents misalignment and speeds up incident resolution.
12 |
13 | ## Example Of Stakeholder
14 |
15 | During a payment processing outage, stakeholders include the engineering team working on the fix, customer support handling user complaints, the finance department tracking revenue impact, and customers experiencing failed transactions.
--------------------------------------------------------------------------------
/src/system-failure.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: System failure is when a critical part of your IT infrastructure stops working as expected.
3 | term: System Failure
4 | ---
5 | ## What Is System Failure
6 |
7 | System failure is when a critical part of your IT infrastructure stops working as expected. This can halt key services or disrupt business operations until the issue is fixed.
8 |
9 | ## Example Of System Failure
10 |
11 | A payment gateway goes offline during peak hours, stopping all customer transactions until engineers restore the service.
12 |
13 | ## How To Implement System Failure Response
14 |
15 | - Set up monitoring to detect failures quickly
16 | - Define clear incident response steps for your team
17 | - Keep backup systems or failover solutions ready
18 | - Communicate updates to stakeholders during outages
19 | - Review each failure to improve future responses
20 |
21 | ## Best Practices
22 |
23 | - Test your backup and recovery processes regularly
24 | - Document all incident responses for future learning
25 | - Train your team to handle high-pressure situations calmly
--------------------------------------------------------------------------------
/src/system-outage.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A system outage is a period when a computer system, service, or application becomes unavailable or non-functional for its intended users.
3 | term: System Outage
4 | ---
5 | ## What Is System Outage
6 |
7 | A system outage is a period when a computer system, service, or application becomes unavailable or non-functional for its intended users. Outages can be planned (maintenance) or unplanned (failures), and may affect entire systems or specific components.
8 |
9 | ## Example Of System Outage
10 |
11 | An e-commerce platform experiences a two-hour outage during a major sale when its payment processing system fails due to unexpected traffic volume. The company loses revenue, frustrates customers, and damages its reputation.
--------------------------------------------------------------------------------
/src/teams-multi-management.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Teams (multi) management refers to the coordination and oversight of multiple teams involved in incident response.
3 | term: Teams (Multi) Management
4 | ---
5 | ## What Is Teams (Multi) Management
6 |
7 | Teams (multi) management refers to the coordination and oversight of multiple teams involved in incident response. It includes defining roles, responsibilities, and communication channels across different groups such as operations, development, and customer support.
8 |
9 | ## Why Is Teams (Multi) Management Important
10 |
11 | Effective multi-team management improves collaboration, reduces duplication of efforts, and speeds up incident resolution. It helps maintain clear lines of communication and accountability, especially in complex incidents that require diverse expertise.
12 |
13 | ## How To Implement Teams (Multi) Management
14 |
15 | - Define clear roles and responsibilities for each team
16 | - Establish communication protocols between teams
17 | - Use a centralized incident management platform
18 | - Implement regular cross-team training and drills
19 | - Create escalation paths for inter-team issues
20 |
21 | ## Best Practices
22 |
23 | - Foster a culture of collaboration and shared responsibility
24 | - Conduct joint post-incident reviews to improve coordination
25 | - Use tools that support multi-team visibility and communication
--------------------------------------------------------------------------------
/src/template-library.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A template library is a collection of pre-defined, customizable documents and workflows for common incident types and communications.
3 | term: Template Library
4 | ---
5 | ## What Is Template Library
6 |
7 | A template library is a collection of pre-defined, customizable documents and workflows for common incident types and communications. It includes templates for incident reports, status updates, and post-incident reviews.
8 |
9 | ## Why Is Template Library Important
10 |
11 | Template libraries save time, maintain consistency in communication, and help teams follow best practices. They reduce the cognitive load on responders during stressful incidents and help ensure all necessary information is captured and shared.
12 |
13 | ## How To Create Template Library
14 |
15 | - Identify common incident types and required communications
16 | - Create templates for each scenario, including key fields
17 | - Store templates in an easily accessible central location
18 | - Allow for customization to fit specific incident needs
19 | - Regularly review and update templates based on feedback
20 |
21 | ## Best Practices
22 |
23 | - Keep templates clear, concise, and easy to use
24 | - Include placeholders for incident-specific information
25 | - Regularly update templates based on lessons learned
--------------------------------------------------------------------------------
/src/threat.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: In incident management, a threat is any potential danger that could exploit vulnerabilities in a system, leading to unauthorized access, data breaches, service disruptions, or other harmful outcomes.
3 | term: Threat
4 | ---
5 | ## What Is Threat
6 |
7 | In incident management, a threat is any potential danger that could exploit vulnerabilities in a system, leading to unauthorized access, data breaches, service disruptions, or other harmful outcomes. Threats can be intentional (like cyberattacks) or unintentional (like human error or natural disasters).
8 |
9 | ## Why Is Understanding Threat Important
10 |
11 | Understanding threats is crucial for proactive incident management. By identifying potential threats before they materialize, organizations can implement preventive measures, develop appropriate response plans, and allocate resources effectively. This awareness helps minimize the impact of incidents and reduces recovery time.
12 |
13 | ## Example Of Threat
14 |
15 | A newly discovered software vulnerability in a widely-used application could allow attackers to gain unauthorized access to sensitive data. This represents a threat that must be addressed through patching, configuration changes, or other mitigation strategies before it can be exploited.
--------------------------------------------------------------------------------
/src/threshold.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A threshold is a predefined limit or boundary that, when crossed, triggers an alert or incident.
3 | term: Threshold
4 | ---
5 | ## What Is Threshold
6 |
7 | A threshold is a predefined limit or boundary that, when crossed, triggers an alert or incident. In incident management, thresholds define the acceptable performance parameters for systems, applications, or services, helping teams identify abnormal conditions that require attention.
8 |
9 | ## Why Is Threshold Important
10 |
11 | Well-defined thresholds help teams detect issues before they impact users. They provide an objective basis for determining when to create incidents and how to prioritize them. Appropriate thresholds reduce alert noise while ensuring genuine problems don't go unnoticed.
12 |
13 | ## Example Of Threshold
14 |
15 | A web application has a response time threshold set at 2 seconds. When the average response time exceeds this value for 5 consecutive minutes, an alert is triggered. This allows the team to investigate performance degradation before users report problems.
--------------------------------------------------------------------------------
/src/ticket-automation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Ticket automation is a process that uses software to automatically create, route, and manage support tickets without human intervention.
3 | term: Ticket Automation
4 | ---
5 | ## What Is Ticket Automation
6 |
7 | Ticket automation is a process that uses software to automatically create, route, and manage support tickets without human intervention. It streamlines incident management by handling repetitive tasks, categorizing issues, and assigning them to the appropriate teams or individuals.
8 |
9 | ## Why Is Ticket Automation Important
10 |
11 | Ticket automation reduces manual workload, minimizes human error, and speeds up response times. It allows support teams to focus on complex issues while routine tasks are handled automatically. This leads to improved efficiency, consistent service quality, and higher customer satisfaction.
12 |
13 | ## How To Implement Ticket Automation
14 |
15 | - Choose a ticketing system with automation features
16 | - Define rules for ticket categorization and routing
17 | - Set up automatic responses for common issues
18 | - Integrate with other tools like monitoring systems
19 | - Train staff on using the automated system
20 |
21 | ## Best Practices
22 |
23 | - Regularly review and update automation rules
24 | - Use clear, concise language in automated responses
25 | - Balance automation with human touch for complex issues
--------------------------------------------------------------------------------
/src/ticket-management.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Ticket management is the overall process of handling incident tickets throughout their lifecycle.
3 | term: Ticket Management
4 | ---
5 | ## What Is Ticket Management
6 |
7 | Ticket management is the overall process of handling incident tickets throughout their lifecycle. This includes creating, logging, categorizing, prioritizing, assigning, tracking, and closing tickets.
8 |
9 | ## Why Is Ticket Management Important
10 |
11 | Effective ticket management streamlines the incident resolution process. It improves team coordination and provides visibility into support workload and performance. It helps resolve issues faster.
12 |
13 | ## Example Of Ticket Management
14 |
15 | A support team uses an IT Service Management (ITSM) tool to manage incoming support requests. The tool routes tickets based on issue type, assigns them to available staff, and tracks resolution time against service level agreements.
16 |
17 | ## How To Implement Ticket Management
18 |
19 | - Select an appropriate ticketing system or software
20 | - Define clear workflows for ticket handling, including escalation paths
21 | - Train staff on using the system and following procedures
22 | - Automate routine tasks like ticket assignment or status updates
23 |
24 | ## Best Practices
25 |
26 | - Define clear priority levels and response time goals for tickets
27 | - Regularly analyze ticket data to identify recurring problems or bottlenecks
28 | - Integrate the ticketing system with monitoring tools for automatic ticket creation
--------------------------------------------------------------------------------
/src/ticket.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A ticket is a digital record of an incident, alert, or service request within an IT system.
3 | term: Ticket
4 | ---
5 | ## What Is Ticket
6 |
7 | A ticket is a digital record of an incident, alert, or service request within an IT system. It contains details about the event, its status, and actions taken.
8 |
9 | ## Why Is Ticket Important
10 |
11 | Tickets provide a structured way to track issues from report to resolution. They centralize communication and create a historical record for analysis and improvement.
12 |
13 | ## Example Of Ticket
14 |
15 | An employee reports a broken printer via a help desk portal. A ticket is automatically created. It includes the employee's name, printer location, issue description, and assigned technician.
--------------------------------------------------------------------------------
/src/time-to-acknowledge.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Time to Acknowledge is the duration between when an incident alert is triggered and when a team member acknowledges receipt of that alert.
3 | term: Time to Acknowledge
4 | ---
5 | ## What Is Time to Acknowledge
6 |
7 | Time to Acknowledge is the duration between when an incident alert is triggered and when a team member acknowledges receipt of that alert. This metric measures how quickly your incident response team notices and takes ownership of an issue after it's been detected by monitoring systems.
8 |
9 | ## Why Is Time to Acknowledge Important
10 |
11 | Quick acknowledgment of incidents directly impacts your overall resolution time. Reducing this metric helps minimize system downtime and potential business impact. It also serves as an indicator of your team's alertness and the effectiveness of your notification systems.
12 |
13 | ## Example Of Time to Acknowledge
14 |
15 | A critical database server goes down at 2:15 AM, triggering an alert. The on-call engineer receives the notification and acknowledges it at 2:18 AM, resulting in a 3-minute Time to Acknowledge. This prompt response allows the team to begin addressing the issue quickly, even during off-hours.
--------------------------------------------------------------------------------
/src/time-to-detect.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Time to Detect, or Mean Time to Detect (MTTD), measures the average time elapsed between when an incident begins and when your team first detects or identifies it.
3 | term: Time To Detect
4 | ---
5 | ## What Is Time To Detect
6 |
7 | Time to Detect, or Mean Time to Detect (MTTD), measures the average time elapsed between when an incident begins and when your team first detects or identifies it.
8 |
9 | ## Why Is Time To Detect Important
10 |
11 | Faster detection allows for a quicker response. Reducing the time it takes to find an issue minimizes its potential impact and damage. It is a crucial first step in the incident lifecycle.
12 |
13 | ## Example Of Time To Detect
14 |
15 | A website error starts occurring at 2:00 PM. Monitoring systems automatically generate an alert received by the on-call engineer at 2:07 PM. The Time to Detect for this incident is 7 minutes.
--------------------------------------------------------------------------------
/src/time-to-resolution.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Time to Resolution, often called Mean Time to Resolution (MTTR), is the average time taken to completely fix an incident after it has been reported.
3 | term: Time To Resolution
4 | ---
5 | ## What Is Time To Resolution
6 |
7 | Time to Resolution, often called Mean Time to Resolution (MTTR), is the average time taken to completely fix an incident after it has been reported. It measures the duration from the initial report until the issue is resolved.
8 |
9 | ## Why Is Time To Resolution Important
10 |
11 | This metric measures the efficiency of your incident response process. A lower Time to Resolution indicates faster recovery from disruptions. This minimizes downtime and impact on users.
12 |
13 | ## Example Of Time To Resolution
14 |
15 | If three incidents took 4 hours, 6 hours, and 8 hours to resolve, the total time is 18 hours. The average Time to Resolution (MTTR) is 18 hours divided by 3 incidents, which equals 6 hours.
--------------------------------------------------------------------------------
/src/time-to-respond.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Time to Respond is the duration between when an incident is acknowledged and when active troubleshooting or remediation work begins.
3 | term: Time to Respond
4 | ---
5 | ## What Is Time to Respond
6 |
7 | Time to Respond is the duration between when an incident is acknowledged and when active troubleshooting or remediation work begins. This metric measures how quickly your team transitions from awareness of an issue to taking concrete steps toward resolving it.
8 |
9 | ## Why Is Time to Respond Important
10 |
11 | A short response time demonstrates operational readiness and helps minimize the business impact of incidents. It reflects your team's ability to mobilize resources and begin addressing issues promptly, which is crucial for maintaining service reliability and customer trust.
12 |
13 | ## Example Of Time to Respond
14 |
15 | After acknowledging a website outage alert at 10:05 AM, the on-call engineer logs into the system, reviews error logs, and begins troubleshooting at 10:12 AM. The Time to Respond is 7 minutes, showing the team's readiness to address critical issues quickly.
--------------------------------------------------------------------------------
/src/triage-automation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Triage automation is the use of AI and machine learning to automatically assess, categorize, and prioritize incoming incidents or alerts.
3 | term: Triage Automation
4 | ---
5 | ## What Is Triage Automation
6 |
7 | Triage automation is the use of AI and machine learning to automatically assess, categorize, and prioritize incoming incidents or alerts. It helps quickly determine the severity and urgency of issues, routing them to the appropriate teams or resources.
8 |
9 | ## Why Is Triage Automation Important
10 |
11 | Triage automation significantly reduces response times and improves accuracy in incident handling. It helps teams focus on critical issues first, reduces alert fatigue, and ensures consistent application of triage criteria across all incidents.
12 |
13 | ## How To Implement Triage Automation
14 |
15 | - Select an AI-powered incident management platform
16 | - Define clear criteria for incident severity and priority
17 | - Train the system using historical incident data
18 | - Integrate with existing monitoring and alerting tools
19 | - Continuously refine the automation rules based on feedback
20 |
21 | ## Best Practices
22 |
23 | - Regularly review and update triage criteria
24 | - Maintain a human oversight mechanism for complex cases
25 | - Use triage automation data to identify recurring issues
--------------------------------------------------------------------------------
/src/triage.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Triage is the process of quickly assessing and prioritizing incoming incidents or alerts.
3 | term: Triage
4 | ---
5 | ## What Is Triage
6 |
7 | Triage is the process of quickly assessing and prioritizing incoming incidents or alerts. It determines the urgency and impact of each issue to decide the order of response.
8 |
9 | ## Why Is Triage Important
10 |
11 | Triage helps teams focus on the most critical issues first. It efficiently allocates resources and prevents delays in addressing high-impact incidents. This process minimizes business disruption.
12 |
13 | ## Example Of Triage
14 |
15 | A security operations center receives many alerts daily. An analyst quickly reviews each alert. They determine if it's a false alarm, a minor issue, or a serious threat requiring immediate action.
16 |
17 | ## How To Implement Triage With Spike
18 |
19 | - Use Spike's incident severity filter to quickly categorize alerts based on urgency.
20 | - Prioritize incidents with customizable labels that highlight impact and priority.
21 |
22 | Start triaging incidents efficiently with [Spike](https://app.spike.sh/signup) today.
--------------------------------------------------------------------------------
/src/trigger.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A trigger in incident management is an event or condition that initiates an automated response or alert.
3 | term: Trigger
4 | ---
5 | ## What Is Trigger
6 |
7 | A trigger in incident management is an event or condition that initiates an automated response or alert. Triggers detect abnormal system behavior and prompt incident creation based on predefined thresholds or conditions.
8 |
9 | ## Why Is Trigger Important
10 |
11 | Triggers enable rapid detection of incidents before they impact users. They automate the initial response process, reduce detection time, and help teams prioritize issues based on severity and impact.
12 |
13 | ## Example Of Trigger
14 |
15 | A monitoring system detects CPU usage exceeding 90% for more than five minutes on a critical server. This triggers an alert that automatically creates an incident ticket and notifies the on-call engineer via SMS.
16 |
17 | ## How To Implement Trigger
18 |
19 | - Identify key metrics and thresholds that indicate potential issues
20 | - Configure monitoring tools to detect these conditions
21 | - Set up notification channels for different trigger types
22 | - Create automated workflows for initial response steps
23 | - Regularly review and refine trigger conditions
24 |
25 | ## Best Practices
26 |
27 | - Set appropriate thresholds to avoid alert fatigue from false positives
28 | - Configure different notification channels based on trigger severity
29 | - Regularly review trigger effectiveness and adjust as systems evolve
--------------------------------------------------------------------------------
/src/unplanned-downtime.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Unplanned downtime occurs when systems fail unexpectedly, disrupting business operations.
3 | term: Unplanned Downtime
4 | ---
5 | ## What Is Unplanned Downtime
6 |
7 | Unplanned downtime is an unexpected period when a system, service, or infrastructure becomes unavailable, disrupting normal operations. It occurs without warning due to hardware failures, software bugs, network issues, power outages, or human error, causing immediate impact to business operations.
8 |
9 | ## Example Of Unplanned Downtime
10 |
11 | A database server crashes during peak business hours due to a hardware failure. The incident management team receives alerts, diagnoses the issue, and works to restore service while communicating updates to stakeholders. The entire process takes three hours, affecting customer transactions.
12 |
13 | ## Types Of Unplanned Downtime
14 |
15 | - Infrastructure failures: Hardware malfunctions, network outages, or power disruptions
16 | - Software issues: Bugs, memory leaks, or compatibility problems
17 | - Security incidents: DDoS attacks, ransomware, or other cyber threats
18 | - Human error: Accidental configuration changes or improper maintenance
--------------------------------------------------------------------------------
/src/unplanned-maintenance.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Unplanned maintenance, also known as corrective or reactive maintenance, refers to repair work that occurs unexpectedly due to sudden equipment failures or system malfunctions.
3 | term: Unplanned Maintenance
4 | ---
5 | ## What Is Unplanned Maintenance
6 |
7 | Unplanned maintenance, also known as corrective or reactive maintenance, refers to repair work that occurs unexpectedly due to sudden equipment failures or system malfunctions. Unlike planned maintenance, it's performed as an immediate response to restore services or fix broken components without prior scheduling.
8 |
9 | ## Why Is Unplanned Maintenance Important
10 |
11 | Unplanned maintenance is crucial for minimizing downtime when unexpected failures occur. It helps prevent further damage to equipment, reduces safety hazards, and allows operations to resume quickly. Though reactive by nature, effective unplanned maintenance processes can significantly limit the impact of unforeseen incidents.
12 |
13 | ## Example Of Unplanned Maintenance
14 |
15 | A server crashes unexpectedly during peak business hours, causing a critical application to become unavailable. The IT team must immediately diagnose the issue, replace faulty hardware components, and restore the system to working condition to minimize service disruption.
--------------------------------------------------------------------------------
/src/uptime-percentage.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Uptime is calculated as operational time divided by total possible time, expressed as a percentage.
3 | term: Uptime Percentage
4 | ---
5 | ## What Is Uptime Percentage
6 |
7 | Uptime percentage is a metric that quantifies system reliability by measuring the proportion of time a service remains operational. It's calculated by dividing the total operational time by the total possible time and multiplying by 100. This percentage is often expressed in "nines" (99.9%, 99.99%, etc.).
8 |
9 | ## Why Is Uptime Percentage Important
10 |
11 | Uptime percentage serves as a key indicator of service reliability and quality. It helps organizations set clear availability targets, measure performance against SLAs, and compare reliability across different systems or time periods. Even small differences in uptime percentage can represent significant differences in actual downtime.
12 |
13 | ## Example Of Uptime Percentage
14 |
15 | A critical payment processing system has a target uptime of 99.999% (five nines). This allows for only 5.26 minutes of downtime per year. During an incident review, the team discovers the actual uptime was 99.98%, representing about 1.75 hours of downtime—significantly exceeding the allowable threshold.
--------------------------------------------------------------------------------
/src/urgency-classification.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Urgency classification is the process of categorizing incidents based on how quickly they require resolution.
3 | term: Urgency Classification
4 | ---
5 | ## What Is Urgency Classification
6 |
7 | Urgency classification is the process of categorizing incidents based on how quickly they require resolution. It represents the time sensitivity of an incident independent of its impact and helps determine overall incident priority when combined with impact assessment.
8 |
9 | ## Why Is Urgency Classification Important
10 |
11 | Proper urgency classification helps support teams focus on time-sensitive issues first. It creates a standardized approach to incident prioritization that reduces subjective decision-making and helps meet service level agreements consistently.
12 |
13 | ## Example Of Urgency Classification
14 |
15 | A company classifies incident urgency into three levels: High (requires immediate attention), Medium (requires attention within working hours), and Low (can be scheduled). A system outage during peak business hours receives high urgency, while a minor reporting issue might receive low urgency.
16 |
17 | ## How To Implement Urgency Classification With Spike
18 |
19 | - Use Spike to set up urgency levels based on your team's needs.
20 | - Classify incidents into categories like high, medium, or low to guide response priorities.
21 |
22 | Start prioritizing incidents effectively with urgency classification in [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/user-experience.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: User experience in incident management refers to how end users perceive and interact with services during and after an incident.
3 | term: User Experience
4 | ---
5 | ## What Is User Experience
6 |
7 | User experience in incident management refers to how end users perceive and interact with services during and after an incident. It encompasses the accessibility, performance, and reliability of systems, as well as how well users are informed about issues and their resolution status.
8 |
9 | ## Why Is User Experience Important
10 |
11 | User experience directly affects customer satisfaction and trust. Poor handling of incidents can damage reputation and lead to customer churn. Maintaining positive user experience during incidents demonstrates reliability and builds confidence in your services, even when problems occur.
12 |
13 | ## Example Of User Experience
14 |
15 | During a network outage, a company provides real-time status updates through multiple channels, offers alternative solutions, and communicates expected resolution times. Despite the outage, users feel informed and valued, minimizing frustration and maintaining trust.
--------------------------------------------------------------------------------
/src/vulnerability.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A vulnerability is a weakness in a computer system, network, or application that can be exploited by cybercriminals to gain unauthorized access.
3 | term: Vulnerability
4 | ---
5 | ## What Is Vulnerability
6 |
7 | A vulnerability is a weakness in a computer system, network, or application that can be exploited by cybercriminals to gain unauthorized access. These weaknesses may exist in software code, system configurations, or security practices, allowing attackers to run malicious code, install malware, or steal sensitive data.
8 |
9 | ## Why Is Understanding Vulnerability Important
10 |
11 | Understanding vulnerabilities is crucial for protecting your systems against potential attacks. Unaddressed vulnerabilities represent open doors for malicious actors, putting your organization's data, operations, and reputation at risk. Identifying vulnerabilities early allows you to patch them before they can be exploited.
12 |
13 | ## Example Of Vulnerability
14 |
15 | A common example is SQL injection, where attackers insert malicious SQL code into input fields on a website. If the application doesn't properly validate user inputs, attackers can manipulate the database to extract sensitive information or gain administrative access to the system.
--------------------------------------------------------------------------------
/src/warning.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Warnings alert teams to potential threats before they become full incidents requiring response.
3 | term: Warning
4 | ---
5 | ## What Is Warning
6 |
7 | A warning is an alert or notification issued to inform about a potential or imminent threat, risk, or problem. In incident management, it's a proactive measure to prevent or mitigate issues before they escalate into full-blown incidents.
8 |
9 | ## Why Is Warning Important
10 |
11 | Warnings are crucial for early detection and prevention of incidents. They allow teams to take preemptive action, reducing the likelihood of service disruptions and minimizing potential damage or downtime.
12 |
13 | ## Example Of Warning
14 |
15 | A server's CPU usage reaches 80%, triggering a warning alert. This prompts the IT team to investigate and address the issue before it leads to system slowdowns or crashes.
--------------------------------------------------------------------------------
/src/webhook.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Webhooks send automatic alerts between apps during incidents for faster response and better tool integration.
3 | term: Webhook
4 | ---
5 | ## What Is Webhook
6 |
7 | A webhook is an automated message sent from one application to another when a specific event occurs. In incident management, webhooks enable real-time notifications and integrations between different tools and systems.
8 |
9 | ## Why Is Webhook Important
10 |
11 | Webhooks facilitate seamless information flow between incident management tools and other systems. They enable rapid notifications, automate workflows, and improve overall incident response efficiency.
12 |
13 | ## Example Of Webhook
14 |
15 | When a critical incident is created in an incident management system, a webhook automatically triggers a notification in the team's chat application for immediate awareness.
16 |
17 | ## How To Implement Webhook
18 |
19 | - Identify systems that need to communicate via webhooks
20 | - Configure webhook endpoints in the receiving application
21 | - Set up triggers for webhook dispatch in the sending application
22 | - Test webhook functionality to ensure proper data transmission
23 |
24 | ## Best Practices
25 |
26 | - Secure webhook endpoints with authentication mechanisms
27 | - Implement retry logic for failed webhook deliveries
28 | - Use webhooks judiciously to avoid overwhelming systems
--------------------------------------------------------------------------------
/src/weekly-rotation.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Weekly rotation is a scheduling method where on-call or first responder duties change hands every week.
3 | term: Weekly Rotation
4 | ---
5 | ## What Is Weekly Rotation
6 |
7 | Weekly rotation is a scheduling method where on-call or first responder duties change hands every week. It helps distribute the workload and prevent fatigue.
8 |
9 | ## Why Is Weekly Rotation Important
10 |
11 | Weekly rotation keeps the incident response team fresh and reduces burnout. It also spreads knowledge across the team.
12 |
13 | ## How To Implement Weekly Rotation With Spike
14 |
15 | - Go to the On-Call section in Spike and select "Create Schedule"
16 | - Name your schedule and choose "Weekly" as the rotation pattern
17 | - Add team members who will participate in the rotation
18 | - Set the start date and handover time for each rotation
19 | - Activate the schedule to begin the automatic weekly rotations
20 |
21 | Keep your team fresh and responsive with Spike's on-call scheduling—set up your first weekly rotation in [Spike](https://app.spike.sh/signup) today.
--------------------------------------------------------------------------------
/src/well-being-features.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Well-being features in incident management are tools and policies that support the mental and physical health of on-call engineers.
3 | term: Well-Being Features
4 | ---
5 | ## What Are Well-Being Features
6 |
7 | Well-being features in incident management are tools and policies that support the mental and physical health of on-call engineers. These may include workload limits, stress tracking, and time-off planning.
8 |
9 | ## Why Are Well-Being Features Important
10 |
11 | Supporting well-being helps reduce burnout and keeps teams alert during incidents. Healthy teams respond faster and make fewer mistakes.
12 |
13 | ## How To Implement Well-Being Features With Spike
14 |
15 | - Enable Cooldown mode after handling incidents to prevent alert overload during recovery
16 | - Use Deep Work mode when you need focused time—only critical alerts will reach you
17 | - Activate Out of Office mode when unavailable so alerts route to other team members
18 | - Set up these modes with one click in your Spike profile or mobile app
19 | - Schedule these modes in advance for planned focus time or time off
20 |
21 | Protect your team from burnout while maintaining reliable incident response—try the well-being features in [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------
/src/widespread-outage.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Large-scale disruptions impacting many users and critical services across multiple locations.
3 | term: Widespread Outage
4 | ---
5 | ## What Is Widespread Outage
6 |
7 | A widespread outage is a significant disruption affecting a large number of users or systems across multiple locations or services simultaneously. It often impacts critical infrastructure or core services.
8 |
9 | ## Example Of Widespread Outage
10 |
11 | An ISP experiences a network failure, causing internet service disruption for millions of customers across multiple cities.
--------------------------------------------------------------------------------
/src/work-log.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: Incident work logs document response steps in time order to help teams learn from past events.
3 | term: Work Log
4 | ---
5 | ## What Is Work Log
6 |
7 | A work log is a chronological record of actions, decisions, and progress made during incident management. It captures key details about the incident handling process, serving as a valuable resource for analysis and improvement.
8 |
9 | ## Why Is Work Log Important
10 |
11 | Work logs provide a clear audit trail of incident response activities. They help teams track progress, coordinate efforts, and facilitate post-incident reviews for continuous improvement.
12 |
13 | ## Example Of Work Log
14 |
15 | During a server outage, the work log might include entries like:
16 |
17 | - 10:15 AM: Alert received for server downtime
18 | - 10:20 AM: Initial investigation started
19 | - 10:35 AM: Root cause identified as failed hard drive
20 | - 11:00 AM: Replacement drive installed
21 | - 11:30 AM: Server back online, monitoring for stability
22 |
23 | ## How To Create Work Log
24 |
25 | - Use an incident management tool with built-in logging features
26 | - Train team members on consistent logging practices
27 | - Include timestamps, actions taken, and outcomes for each entry
28 | - Regularly review logs to identify areas for process improvement
29 |
30 | ## Best Practices
31 |
32 | - Keep entries concise, clear, and factual
33 | - Update the log in real time during incident handling
34 | - Include both successful and unsuccessful actions
--------------------------------------------------------------------------------
/src/yearly-incident-review.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: A Yearly Incident Review examines major incidents to find patterns and improve processes.
3 | term: Yearly Incident Review
4 | ---
5 | ## What Is Yearly Incident Review
6 |
7 | A Yearly Incident Review is an annual assessment of all major incidents that occurred within an organization. It involves analyzing incident reports, response times, and resolution strategies to identify patterns and areas for improvement in incident management processes.
8 |
9 | ## Why Is Yearly Incident Review Important
10 |
11 | Yearly Incident Reviews help organizations learn from past experiences, refine their incident response strategies, and prevent recurring issues. They provide valuable insights into the effectiveness of current processes and highlight areas that need attention or resources.
12 |
13 | ## Example Of Yearly Incident Review
14 |
15 | An IT company reviews all major outages from the past year. They discover that 60% of incidents were related to database failures. This insight leads them to invest in more robust database monitoring and backup systems for the coming year.
16 |
17 | ## How To Implement Yearly Incident Review With Spike
18 |
19 | - Use Spike's analytics to compile incident data from the past year
20 | - Identify patterns and trends in incident types and response times
21 | - Review Spike's incident reports to refine your response strategies
22 |
23 | Streamline your yearly incident reviews with [Spike](https://app.spike.sh/signup) and plan for a stronger tomorrow.
--------------------------------------------------------------------------------
/src/yoy-year-over-year-incident-analysis.md:
--------------------------------------------------------------------------------
1 | ---
2 | excerpt: YOY Incident Analysis tracks how incident patterns change from one year to the next to spot trends.
3 | term: YOY (Year-Over-Year) Incident Analysis
4 | ---
5 | ## What Is YOY (Year-Over-Year) Incident Analysis
6 |
7 | YOY Incident Analysis compares incident data from a given period with data from the same period in the previous year. It helps organizations understand how their incident management performance has changed over time and identify long-term trends in incident occurrence and resolution.
8 |
9 | ## Why Is YOY Incident Analysis Important
10 |
11 | YOY Incident Analysis allows organizations to measure the effectiveness of their incident management strategies over time. It helps identify persistent issues, track improvements, and adjust resources and processes based on long-term trends rather than short-term fluctuations.
12 |
13 | ## Example Of YOY Incident Analysis
14 |
15 | An e-commerce company compares this year's holiday season incidents to last year's. They find a 30% reduction in website downtime incidents, attributing it to infrastructure improvements made earlier in the year.
16 |
17 | ## How To Implement YOY Incident Analysis With Spike
18 |
19 | - Use Spike's analytics to compare incident data year-over-year
20 | - Track changes in incident frequency, response time, and resolution speed over time
21 |
22 | Gain insights into your incident management strategy and make informed decisions with [Spike](https://app.spike.sh/signup).
--------------------------------------------------------------------------------