├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── assets │ ├── img │ │ ├── covers │ │ │ ├── blameless.png │ │ │ ├── default.png │ │ │ ├── post-mortem_process.png │ │ │ ├── post-mortem_template.png │ │ │ └── reading.png │ │ ├── headers │ │ │ ├── Postmortems-Accountability.png │ │ │ ├── Postmortems-Blameless.png │ │ │ ├── Postmortems-Checklist.png │ │ │ ├── Postmortems-Examples.png │ │ │ ├── Postmortems-InfoSharing.png │ │ │ ├── Postmortems-Introduce.png │ │ │ ├── Postmortems-Meeting.png │ │ │ ├── Postmortems-NextSteps.png │ │ │ ├── Postmortems-Questions.png │ │ │ ├── Postmortems-Resources.png │ │ │ ├── Postmortems-StepByStep.png │ │ │ ├── Postmortems-Template.png │ │ │ ├── Postmortems-Tips.png │ │ │ ├── Postmortems-Title.png │ │ │ ├── Postmortems-WhatIs.png │ │ │ └── pagerduty_logo.png │ │ └── thumbnails │ │ │ ├── 1Administration.png │ │ │ ├── 2CreateATimeline.png │ │ │ ├── 3DocumentImpact.png │ │ │ ├── 4AnalyzeTheIncident.png │ │ │ ├── 5FollowUpActions.png │ │ │ ├── 6WriteExternalMessaging.png │ │ │ ├── 7PostmortemReview.png │ │ │ ├── NextSteps │ │ │ ├── 1NewPostmortemReport.png │ │ │ ├── 2NewPostmortemReport.png │ │ │ ├── 3PostmortemDataSources.png │ │ │ ├── 4CreateTimeline.png │ │ │ ├── 5PostmortemDetail.png │ │ │ ├── 6ChangePostmortemStatus.png │ │ │ ├── 7ExportPostmortemPDF.png │ │ │ ├── 8EditPostmortemTemplate.png │ │ │ └── 9EditPostmortemSections.png │ │ │ ├── PostmortemChecklist.png │ │ │ └── PostmortemTemplate_preview.png │ └── pdf │ │ ├── PostmortemAnalysisQuestions.pdf │ │ ├── PostmortemChecklist.pdf │ │ └── PostmortemTemplate.pdf ├── culture │ ├── accountability.md │ ├── blameless.md │ ├── introduce.md │ └── sharing.md ├── how_to_write │ ├── effective_postmortems.md │ └── writing.md ├── index.md ├── meeting.md ├── next_steps.md ├── resources │ ├── analysis.md │ ├── checklist.md │ ├── examples.md │ ├── post_mortem_template.md │ └── reading.md └── what_is.md ├── mkdocs.yml ├── netlify.toml ├── requirements.txt ├── runtime.txt └── 
screenshot.png /.gitignore: -------------------------------------------------------------------------------- 1 | site/ 2 | .DS_Store -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # https://circleci.com/docs/2.0/circleci-images/#python 2 | # We may as well use the same image we use for actually deploying our sites. 3 | FROM circleci/python:3.9.2 4 | 5 | # Dependencies 6 | RUN sudo pip install mkdocs 7 | RUN sudo pip install pymdown-extensions 8 | RUN sudo pip install pygments 9 | 10 | # Install the PagerDuty theme. 11 | WORKDIR /tmp 12 | RUN git clone https://github.com/pagerduty/mkdocs-theme-pagerduty 13 | RUN cd mkdocs-theme-pagerduty && sudo python3 setup.py install 14 | 15 | # Set our working directory and user 16 | WORKDIR /docs 17 | RUN sudo useradd -m --uid 1000 mkdocs 18 | USER mkdocs 19 | 20 | # Expose MkDocs server 21 | EXPOSE 8000 22 | 23 | # Start the local MkDocs server. 24 | ENTRYPOINT ["mkdocs"] 25 | CMD ["serve", "--dev-addr=0.0.0.0:8000"] 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 PagerDuty, Inc. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. 
For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 
49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. 
Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. 
In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. 
We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PagerDuty PostMortem Best Practice Documentation 2 | [![Netlify Status](https://api.netlify.com/api/v1/badges/0552001b-35ae-4513-ae3a-3362c0b95fc3/deploy-status)](https://app.netlify.com/sites/postmortems-1e8475/deploys) 3 | 4 | This is a collection of information about the PagerDuty postmortem process and industry best practices. This guide will teach you how to build a culture of continuous learning, the most important components to include in your analysis, and how to conduct effective postmortem meetings. See the [home page](docs/index.md) for more information on what this documentation is and why it exists. 5 | 6 | You can view the documentation [directly](docs/index.md) in this repository, or rendered as a website at https://postmortems.pagerduty.com. 
 7 | 8 | [![PagerDuty Postmortems Documentation](screenshot.png)](https://postmortems.pagerduty.com) 9 | 10 | ## Development 11 | We use [MkDocs](http://www.mkdocs.org/) to create a static site from this repository. 12 | 13 | ### Native 14 | For local development on your native device, 15 | 16 | 1. Install [MkDocs](http://www.mkdocs.org/#installation). `pip install mkdocs` 17 | 1. Install [MkDocs PyMdown Extensions](https://squidfunk.github.io/mkdocs-material/extensions/pymdown/). `pip install pymdown-extensions` 18 | 1. Install [Pygments](https://pygments.org/) if you want syntax highlighting for any code examples. `pip install pygments` 19 | 1. Install the [PagerDuty MkDocs Theme](https://github.com/pagerduty/mkdocs-theme-pagerduty). 20 | 1. `git clone https://github.com/pagerduty/mkdocs-theme-pagerduty` 21 | 1. `cd mkdocs-theme-pagerduty && python3 setup.py install` 22 | 1. To test locally, run `mkdocs serve` from the project directory. 23 | 1. You can now view the website in your browser at `http://127.0.0.1:8000`. The site will automatically update as you edit the code. 24 | 25 | ### Docker 26 | For local development using Docker, 27 | 28 | 1. Build the docker image and load it for immediate use. `docker build --load -t mkdocs .` 29 | 1. Run the container and pass through your current working directory. `docker run -v $(pwd):/docs -p 127.0.0.1:8000:8000 mkdocs` 30 | 1. You can now view the website in your browser at `http://127.0.0.1:8000`. The site will automatically update as you edit the code. 31 | 32 | _Note: If you're using an Apple Silicon device, add `--platform linux/arm64/v8` to the `docker build` command to get a native Apple Silicon image. That will work faster than translating an amd64 image._ 33 | 34 | ## Deploying 35 | 1. Run `mkdocs build --clean` to produce the static site for upload. 36 | 1. Upload the `site` directory to S3 (or wherever you would like it to be hosted). 
37 | 38 | aws s3 sync ./site/ s3://[BUCKET_NAME] \ 39 | --acl public-read \ 40 | --exclude "*.py*" \ 41 | --delete 42 | 43 | ## License 44 | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0) (See [LICENSE](LICENSE) file) 45 | 46 | ## Contributing 47 | Thank you for considering contributing! If you have any questions, just ask - or submit your issue or pull request anyway. The worst that can happen is we'll politely ask you to change something. We appreciate all friendly contributions. 48 | 49 | Here is our preferred process for submitting a pull request, 50 | 51 | 1. Fork it ( https://github.com/PagerDuty/postmortem-docs/fork ) 52 | 1. Create your feature branch (`git checkout -b my-new-feature`) 53 | 1. Commit your changes (`git commit -am 'Add some feature'`) 54 | 1. Push to the branch (`git push origin my-new-feature`) 55 | 1. Create a new Pull Request. 56 | -------------------------------------------------------------------------------- /docs/assets/img/covers/blameless.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/covers/blameless.png -------------------------------------------------------------------------------- /docs/assets/img/covers/default.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/covers/default.png -------------------------------------------------------------------------------- /docs/assets/img/covers/post-mortem_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/covers/post-mortem_process.png -------------------------------------------------------------------------------- 
/docs/assets/img/covers/post-mortem_template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/covers/post-mortem_template.png -------------------------------------------------------------------------------- /docs/assets/img/covers/reading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/covers/reading.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Accountability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Accountability.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Blameless.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Blameless.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Checklist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Checklist.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Examples.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Examples.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-InfoSharing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-InfoSharing.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Introduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Introduce.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Meeting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Meeting.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-NextSteps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-NextSteps.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Questions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Questions.png 
-------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Resources.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-StepByStep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-StepByStep.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Template.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Tips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Tips.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-Title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-Title.png -------------------------------------------------------------------------------- /docs/assets/img/headers/Postmortems-WhatIs.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/Postmortems-WhatIs.png -------------------------------------------------------------------------------- /docs/assets/img/headers/pagerduty_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/headers/pagerduty_logo.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/1Administration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/1Administration.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/2CreateATimeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/2CreateATimeline.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/3DocumentImpact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/3DocumentImpact.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/4AnalyzeTheIncident.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/4AnalyzeTheIncident.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/5FollowUpActions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/5FollowUpActions.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/6WriteExternalMessaging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/6WriteExternalMessaging.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/7PostmortemReview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/7PostmortemReview.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/1NewPostmortemReport.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/1NewPostmortemReport.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/2NewPostmortemReport.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/2NewPostmortemReport.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/3PostmortemDataSources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/3PostmortemDataSources.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/4CreateTimeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/4CreateTimeline.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/5PostmortemDetail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/5PostmortemDetail.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/6ChangePostmortemStatus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/6ChangePostmortemStatus.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/7ExportPostmortemPDF.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/7ExportPostmortemPDF.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/8EditPostmortemTemplate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/8EditPostmortemTemplate.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/NextSteps/9EditPostmortemSections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/NextSteps/9EditPostmortemSections.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/PostmortemChecklist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/PostmortemChecklist.png -------------------------------------------------------------------------------- /docs/assets/img/thumbnails/PostmortemTemplate_preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/img/thumbnails/PostmortemTemplate_preview.png -------------------------------------------------------------------------------- /docs/assets/pdf/PostmortemAnalysisQuestions.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/pdf/PostmortemAnalysisQuestions.pdf -------------------------------------------------------------------------------- /docs/assets/pdf/PostmortemChecklist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/pdf/PostmortemChecklist.pdf -------------------------------------------------------------------------------- /docs/assets/pdf/PostmortemTemplate.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/docs/assets/pdf/PostmortemTemplate.pdf -------------------------------------------------------------------------------- /docs/culture/accountability.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: A successful postmortem process is based on a culture of honesty, learning, and accountability. Culture change requires management buy-in, but you can lead culture change no matter your role. This guide describes common challenges faced in building a culture of continuous learning through postmortems and strategies for overcoming these challenges. 4 | --- 5 | ![Accountability](../assets/img/headers/Postmortems-Accountability.png) 6 | 7 | Information sharing and transparency also support an environment that cultivates accountability. A common challenge to effective postmortems is that, after analyzing the incident and creating action items to prevent recurrence, information sharing to increase transparency is never done. 8 | 9 | Start by setting a policy for when postmortem action items should be completed. 
At PagerDuty, high-priority action items needed to prevent a Sev-1 incident from recurring should be completed within 15 days after an incident. Action items from a Sev-2 incident should be addressed within 30 days. Communicate this expectation to all of engineering and make sure it is documented for future reference. 10 | 11 | For action items to get done, they must have clear owners. Because we are an Agile and DevOps shop, the cross-functional teams responsible for the affected service are also responsible for implementing improvements expected to reduce the likelihood of failure. Engineering leadership helps clarify what parts of the system each team owns and sets expectations for which teams own new development and operational improvements. Ownership designations are communicated across the organization so all teams understand who owns what and ownership gaps can be identified. **As always, document this information for future reference and new hires.** Any uncertainty about ownership of an incident's action items is discussed in the postmortem meeting with representatives for all teams that may own the action item. 12 | 13 | We have also seen improved accountability for completing action items by involving the leaders responsible for prioritizing a team's work (product managers and engineering managers) in the postmortem meeting. Product managers are responsible for defining a good customer experience. Incidents cause a poor customer experience. Engage product managers in postmortem discussions by explaining that it will provide a wider picture of threats to customer experience and ideas on how to improve that experience. Doing so gives engineering a chance to explain the importance of these action items so that product managers will prioritize the work accordingly.
Similarly, getting engineering leadership more involved in postmortem discussions gives them a better understanding of system weaknesses to inform how and where they should invest technical resources. Sharing this context with the leaders that prioritize work allows them to support the team's effort to quickly complete high-priority action items from incident analysis. 14 | 15 | Finally, ensure postmortem action items are discoverable and regularly viewed. Document postmortem action items as you would any other task. The list of action items from an incident analysis should not only live in your postmortem document. Open tickets in your task management tool, within the project of the team that will own the action item, so it can be viewed alongside all other planned work. We label all tickets with the severity level (Sev-1, Sev-2, etc.) and a date tag (YYYYMMDD) so we can easily query tickets that came from specific incidents and build reporting for the number of open tickets from major incidents. 16 | 17 | !!! info "Key Takeaways" 18 | - Set a policy for postmortem action items: e.g. 15 days for Sev-1 action items, 30 days for Sev-2 action items. 19 | - Clarify ownership of postmortem action items. 20 | - Engage the leaders that prioritize work. 21 | - Open tickets for postmortem action items in your work management ticketing system. 22 | -------------------------------------------------------------------------------- /docs/culture/blameless.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: A successful postmortem process is based on a culture of honesty, learning, and accountability. Culture change requires management buy-in, but you can lead culture change no matter your role. This guide describes common challenges faced in building a culture of continuous learning through postmortems and strategies for overcoming these challenges. 
4 | --- 5 | ![Blameless](../assets/img/headers/Postmortems-Blameless.png) 6 | 7 | As IT professionals, we understand that failure is inevitable in complex systems. **How we respond to failure when it occurs matters.** In _[The Field Guide to Understanding Human Error](https://www.amazon.com/Field-Guide-Understanding-Human-Error/dp/0754648265)_, Sidney Dekker describes two views on human error: 1) the old view, which asserts that people’s mistakes cause failure, and 2) the new view, which treats human error as a symptom of a systemic problem. The old view subscribes to “the bad apple theory,” which holds that removing bad actors will prevent failure. This view attaches an individual's character to their actions, assuming negligence or bad intent leads to the error. 8 | 9 | An organization that follows this old view of human error may respond to an incident by finding the careless individual who caused the incident so they can be reprimanded. **This impulse to blame and punish has the unintended effect of disincentivizing the knowledge sharing required to prevent future failure.** Engineers will hesitate to speak up when incidents occur for fear of being blamed. This silence increases overall mean time to acknowledge and mean time to resolve, and exacerbates the impact of incidents. 10 | 11 | For the postmortem process to result in learning and system improvements, the new view of human error must be followed. In complex systems of software development, a variety of conditions interact to lead to failure. **The goal of the postmortem is to understand what systemic factors led to the incident and identify actions that can prevent this kind of failure from recurring.** A blameless postmortem stays focused on _how_ a mistake was made instead of _who_ made it.
This is a crucial mindset leveraged by many leading organizations (such as Etsy, a pioneer for [blameless postmortems](https://codeascraft.com/2012/05/22/blameless-postmortems/)) for ensuring postmortems have the right tone, empowering engineers to give truly objective accounts of what happened by eliminating the fear of punishment. 12 | 13 | 14 | ## Why Being Blame Aware is Hard 15 | It is easy to agree that we want a culture of continuous improvement, but it is difficult to practice the blamelessness required for learning. The unexpected nature of failure naturally leads humans to react in ways that interfere with our understanding of it. When processing information, the human mind unconsciously takes shortcuts. By applying general rules-of-thumb, the mind optimizes for timeliness over accuracy. When this produces an incorrect conclusion, it is a cognitive bias. 16 | 17 | [J. Paul Reed](https://techbeacon.com/blameless-postmortems-dont-work-heres-what-does) argues the blameless postmortem is a myth because the tendency to blame is hardwired through millions of years of evolutionary neurobiology. Ignoring this tendency or trying to eliminate it entirely is impossible. It is more productive to be “blame aware.” **By being aware of our biases, we will be able to identify when they occur and work to move past them.** We touch upon some of the biases below, but for more details, read [Lindsay Holmwood's](http://fractio.nl/2015/10/30/blame-language-sharing/) article on the cognitive biases we must be aware of when performing postmortems. 18 | 19 | **[Fundamental attribution error](https://en.wikipedia.org/wiki/Fundamental_attribution_error)** is the tendency to believe that what people do reflects their character rather than their circumstances. This describes the old view of human error, assigning responsibility for a failure to bad actors who are careless and incompetent. Ironically, we tend to explain our own actions by our context, not our personality. 
Combat this tendency to blame by intentionally focusing the analysis on situational causes rather than discrete actions individuals took. 20 | 21 | Another pervasive cognitive bias is **confirmation bias**, which is the tendency to favor information that reinforces existing beliefs. When presented with ambiguous information, we tend to interpret it in a way that supports our existing assumptions. When combined with the old view of human error, this bias is dangerous for postmortems because it seeks to blame the bad apple. When approaching the analysis with the assumption that an individual is at fault, you will find a way to support that belief despite evidence to the contrary. 22 | 23 | To combat confirmation bias, Holmwood suggests appointing someone to play devil’s advocate to take contrarian viewpoints during investigations. Be cautious of introducing negativity or combativeness with a devil’s advocate. You can also counter confirmation bias by inviting someone from another team to ask any and all questions that come to their mind. This will help surface lines of inquiry the team has learned to take for granted. 24 | 25 | **Hindsight bias** is a type of memory distortion where we recall events to form a judgment. Knowing the outcome, it is easy to see the event as being predictable despite there having been little or no objective basis for predicting it. Often, we recall events in a way to make ourselves look better. An example is when a person analyzing the causes of an incident believes they knew it would happen like that. Enacting this bias can lead to defensiveness and division within a team. Holmwood suggests avoiding the hindsight bias by explaining events in terms of foresight instead. Start your timeline analysis at a point before the incident and work your way forward instead of backward from resolution. 26 | 27 | Another common bias to be aware of is **[negativity bias](https://en.wikipedia.org/wiki/Negativity_bias)**. 
This is the notion that things of a more negative nature have a greater effect on one’s mental state than those of neutral or even positive nature. Research on social judgments has shown negative information disproportionately impacts a person’s impression of others. This relates to the “bad apple theory,” the belief that there are negative actors in your organization to blame for failures. Studies have also shown people are more likely to attribute negative outcomes to the intentions of another person than neutral and positive outcomes. This also explains our tendency to blame individuals’ characters to explain a major incident. 28 | 29 | In reality, things go right more often than they go wrong, but we tend to focus on and magnify the importance of negative events. Focusing on, exaggerating, and internalizing incidents as negative events can be demoralizing and lead to burnout. Reframing incidents as learning opportunities and remembering to describe what was handled well in your response can help balance perspective. 30 | 31 | ### Cognitive Biases 32 | 33 | | Bias | Definition | Countermeasure | 34 | |---|---|---| 35 | | Fundamental attribution error | What people do reflects their character rather than their circumstances. | Intentionally focus the analysis on situational causes rather than discrete actions individuals took. | 36 | | Confirmation bias | Favoring information that reinforces existing positions. | Appoint someone to play devil’s advocate to take contrarian viewpoints during investigations. | 37 | | Hindsight bias | Seeing the incident as inevitable despite there having been little or no objective basis for predicting it because we know the outcome. | Explain events in terms of foresight instead. Start your timeline analysis at a point before the incident, and work your way forward instead of backward from resolution.
| 38 | | Negativity bias | Things of a more negative nature have a greater effect on one’s mental state than neutral or even positive things. | Reframe incidents as learning opportunities, and remember to describe what was handled well in incident response. | 39 | 40 | We all have these cognitive biases that can lead to distorted views of events and damage team relationships if left unchecked. It is important to be aware of these tendencies so we can acknowledge bias when it occurs. By making postmortems a collaborative process, teams can work as a group to identify blame and then constantly dig deeper in the analysis. 41 | 42 | ## How to Cultivate a Blameless (or Blame-Aware) Culture 43 | Acknowledging blame and working past it is easier said than done. What behaviors can we adopt to move towards a blameless culture? Holmwood eloquently writes about the importance of the words we use to minimize blame and maximize learning. He urges us to ask “what” questions (e.g., “What did you think was happening?” and “What did you do next?”). Asking “what” questions grounds the analysis in the big-picture contributing factors to the incident. 44 | 45 | In his article “[The Infinite Hows](https://www.oreilly.com/ideas/the-infinite-hows),” John Allspaw encourages us to ask “how” questions because they get people to describe (at least some of) the conditions that allowed an event to take place. Holmwood also notes that “how” questions can help clarify technical details, distancing people from the actions they took. Avoid asking “why” questions because they force people to justify their actions, attributing blame. 46 | 47 | [Crucial Accountability](https://www.vitalsmarts.com/crucial-accountability-training/) offers a helpful framework for approaching difficult conversations about unmet expectations that can be applied to postmortems when emotions run high.
When analyzing failure, we may fall into victim, villain, and helpless stories that propel emotions and attempt to justify our worst behaviors. You can move beyond blame by telling the rest of the story. Consider your and others’ roles in the problem. Ask yourself why a reasonable, rational, and decent person may have taken the action that seems to have caused the incident. This thinking will help turn attention to the multiple systemic factors that led to the incident. 48 | 49 | Even when you have made a best effort to remain blameless, it is possible someone may still become defensive during a postmortem meeting if they feel they are being blamed. When this happens, work to restore mutual purpose and mutual respect so a productive discussion can continue. Restore mutual purpose by reiterating that the goal of the postmortem is to understand what systemic factors led to the incident and collaboratively identify actions that can reduce failure moving forward. Often, people act out defensively when they feel their character is being attacked. Restore mutual respect by contrasting. Say what you did not intend (“I did not mean to imply you’re bad at your job.”) contrasted with what you do intend (“I meant to inquire into the situational factors that would lead any responder to take that action.”). Refocus your inquiry away from individual motivation, which implies blame. Abstracting to a nonspecific responder also encourages other responders to contribute more suggestions as to what could have contributed to the system failure. 50 | 51 | !!! info "Key Takeaways" 52 | - Ask “what” and “how” questions rather than “who” or “why.” 53 | - Consider multiple and diverse perspectives. 54 | - Ask yourself why a reasonable, rational, and decent person may have taken a particular action. 55 | - When inquiring about a human action, abstract to a nonspecific responder. Anyone could have made the same mistake.
56 | - Restore mutual purpose and mutual respect by contrasting what you did not intend with what you do intend. 57 | -------------------------------------------------------------------------------- /docs/culture/introduce.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: A successful postmortem process is based on a culture of honesty, learning, and accountability. Culture change requires management buy-in, but you can lead culture change no matter your role. This guide describes common challenges faced in building a culture of continuous learning through postmortems and strategies for overcoming these challenges. 4 | --- 5 | ![How to Introduce](../assets/img/headers/Postmortems-Introduce.png) 6 | 7 | Whether you're introducing postmortems as an entirely new practice at your organization or working to improve an existing process, culture change is hard. No matter your role, the first step to introducing a new process is getting buy-in from leadership and individual contributors because, often, bottom-up changes are more successful than top-down mandates from management. 8 | 9 | To practice blameless postmortems and encourage a culture of continuous improvement, you need commitment from leadership that no individuals will be reprimanded in any way after an incident. 10 | 11 | To convince management to support a shift to blameless analysis, clarify how blame is harmful to the business and explain the business value of blamelessness. For instance, punishing individuals for "causing" incidents discourages people from speaking up when problems occur for fear of being blamed. This silence will increase the mean time to acknowledge incidents, mean time to resolve, and, ultimately, exacerbate the impact of incidents. Organizations can rapidly improve the resilience of their systems and increase the speed of innovation by eliminating the fear of blame and encouraging collaborative learning. 
12 | 13 | It may sound silly, but when selling a new blameless postmortem process to management, avoid blaming them for blaming others. Acknowledge that practicing blamelessness is difficult for everyone. Teams can help hold each other accountable by calling each other out when blame is observed in response to failure. Ask leadership if they will be receptive to receiving that feedback if and when they accidentally suggest blame after an incident. 14 | 15 | A verbal commitment from management to refrain from punishing people for causing incidents is an important start to introducing blameless postmortems, but that alone will not eliminate the fear of blame. Once you have leadership support, you will also need buy-in from the individual contributors who will be performing postmortem analysis. Share that you have commitment from management that no one will be punished after an incident. Because the tendency to blame is not unique to managers, explain to the team why blame is harmful to trust and collaboration. Agree to work together to become more blame-aware and kindly call each other out when blame is observed. 16 | 17 | When Google studied their teams to learn what behaviors made groups successful, they found that psychological safety was the most critical factor for a team to work well together. Harvard Business School professor Amy Edmondson defines psychological safety as "a sense of confidence that the team will not embarrass, reject, or punish someone for speaking up." A sense of safety makes people feel comfortable enough to share information about incidents, which allows for deeper analysis and results in learnings that improve the resilience of your systems. 18 | 19 | Google found that high-performing teams with strong psychological safety share two key behaviors. First, these teams demonstrate conversational turn-taking. Team members speak in roughly the same proportion.
When everyone is able to share their perspective, the collective intelligence of the group increases. Second, good teams have high social sensitivity or empathy. Successful teams are able to sense when someone is feeling upset or left out based on nonverbal cues. 20 | 21 | These behaviors and the resulting sense of psychological safety can be encouraged by modeling vulnerability. A manager at Google found his team was able to find ways to work better together after doing an ice-breaker activity in which everyone shared something personal about themselves. The manager started by telling the team about his struggle with cancer, which helped everyone else feel more comfortable sharing something. Creating emotional bonds within a team leads to greater psychological safety and higher performance. 22 | 23 | Culture change does not happen overnight. Iteratively introduce new practices to the organization by starting small, sharing successful results of experimenting with new practices, and slowly expanding those practices across teams. You can start experimenting with blameless postmortems within a single team. To get started, use our ["How to Write a Postmortem"](../how_to_write/writing.md) guide to share tips. 24 | 25 | It is also easy to start practicing blameless postmortems by analyzing smaller incidents before tackling major ones. Doing postmortems for smaller incidents allows the team to develop the skill of deeper system analysis that goes beyond how people contributed to an incident. This also helps protect individuals while everyone is practicing blameless culture as people may revert to blame, but the impact on the individual will be less than if that same mistake happens with a more critical incident. 26 | 27 | !!! 
info "Key Takeaways" 28 | - Sell the business value of blamelessness: faster incident resolution, more resilient systems, more time for innovation 29 | - Commit to kindly calling each other out when blame is observed 30 | - Start with a single team 31 | - Start with smaller incidents 32 | -------------------------------------------------------------------------------- /docs/culture/sharing.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: A successful postmortem process is based on a culture of honesty, learning, and accountability. Culture change requires management buy-in, but you can lead culture change no matter your role. This guide describes common challenges faced in building a culture of continuous learning through postmortems and strategies for overcoming these challenges. 4 | --- 5 | ![Information Sharing](../assets/img/headers/Postmortems-InfoSharing.png) 6 | 7 | You can scale culture through sharing.1 People want to share their successes, and when people see something that’s going well, they want to replicate that success. It may seem counterintuitive to share incident reports because it seems like you’re sharing a story of failure rather than success. The truth is, practicing blameless postmortems leads to success because it enables teams to learn from failure and improve systems to reduce the prevalence of failure. Framing incidents as learning opportunities with concrete resulting improvements rather than a personal failure also increases morale, which increases employee retention and productivity. 8 | 9 | **Sharing the results of postmortems has two main benefits:** 10 | 1. It increases system knowledge across the organization. 11 | 1. It reinforces a blameless culture. 12 | 13 | By sharing learnings from incident analysis, you help the entire organization learn, not just the affected teams responsible for remediation. 
PagerDuty sends completed postmortems via email to an “Incident Reports” distribution list that includes all of engineering, product, and support, as well as all Incident Commanders (who may not be in any of those departments.) This widens system knowledge for everyone involved in incident response. 14 | 15 | We encourage teams to learn postmortem best practices from each other by hosting a community of experienced postmortem writers available to review postmortems before they are shared more widely. This ensures blameless analysis through feedback and coaching while postmortems are being written. 16 | 17 | We also schedule all postmortem meetings on a shared calendar. This calendar is visible to the entire company, and anyone is welcome to join. This gives engineering teams the opportunity to learn from each other on how to practice blamelessness and deeply analyze incident causes. It also makes clear that incidents are not shameful failures that should be kept quiet. 18 | 19 | Being transparent about system failure reinforces a culture of blamelessness. When postmortems are shared, teams will see that individuals are not blamed or punished for incidents. This will reduce the fear of speaking up when issues inevitably occur. Creating a culture where information can be confidently shared leads to a culture of continuous learning in which teams can work together to design improvements. 20 | 21 | !!! info "Key Takeaways" 22 | * Create a community of experienced postmortem writers to review postmortem drafts and spread best practices. 23 | * Schedule postmortem meetings on a shared calendar, open for any interested parties to listen and learn. 24 | * Email completed postmortems to all teams involved in incident response to share learning and reinforce blamelessness. 25 | 26 | --- 27 | 1. 
Puppet’s [2018 State of DevOps Report](https://puppet.com/resources/whitepaper/state-of-devops-report) tells us operationally mature organizations adopt practices that promote sharing. 28 | -------------------------------------------------------------------------------- /docs/how_to_write/effective_postmortems.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: Here are concrete steps for producing a postmortem document. You will learn the most important information to include in the postmortem, how to collect and present that information, and how to conduct an effective analysis that results in system improvements. 4 | --- 5 | ![Effective Postmortems](../assets/img/headers/Postmortems-Tips.png) 6 | 7 | Writing detailed and accurate postmortems allows you to learn quickly from mistakes and improve systems and processes for everyone. This guide lists some of the things we do to make sure our postmortems are effective. 8 | 9 | ## Do 10 | - Make sure the timeline is an accurate representation of events. 11 | - Define any technical lingo/acronyms you use that newcomers may not understand. 12 | - [Separate what happened from how to fix it](https://www.youtube.com/watch?v=TqaFT-0cY7U). 13 | - Write follow-up tasks that are actionable, specific, and bounded in scope. 14 | - [Discuss how the incident fits into our understanding of the health and resiliency of the services affected](https://www.pagerduty.com/blog/postmortem-understand-service-reliability/). 15 | 16 | ## Do Not 17 | - Use the word "outage" unless it really was an outage. Accurately reflect the impact of an incident. Outage is usually too broad a term to use. It can lead customers to think the product was fully unavailable when that likely was nowhere near the case. 18 | - Change details or events to make things "look better." Be honest in postmortems, otherwise they lose their effectiveness. 19 | - Name and shame someone. 
Keep postmortems blameless. If someone deployed a change that broke things, it's not their fault. Everyone is collectively responsible for building a system that allowed them to deploy a breaking change. 20 | - Blame "human error." Very rarely is the mistake "rooted" in a human performing an action. There are often several contributing factors (the script the human ran didn't have rate limiting, the documentation was out of date, etc.) that can and should be addressed. 21 | - Only point out what went wrong. Drill down to the underlying causes of the issue. -------------------------------------------------------------------------------- /docs/how_to_write/writing.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: Here are concrete steps for producing a postmortem document. You will learn the most important information to include in the postmortem, how to collect and present that information, and how to conduct an effective analysis that results in system improvements. 4 | --- 5 | ![Step by Step](../assets/img/headers/Postmortems-StepByStep.png) 6 | 7 | Below are the steps involved in performing a postmortem at a high level. Below are the details of how to perform each step. 8 | 9 | 1. Create a new postmortem for the incident. 10 | 1. Schedule a postmortem meeting within the required timeframe for all required and optional attendees on the "Incident Postmortem Meetings" shared calendar. 11 | 1. Populate the incident timeline with important changes in status/impact and key actions taken by responders. 12 | - For each item in the timeline, include a metric or some third-party page where the data came from. 13 | 1. Analyze the incident. 14 | - Identify superficial and root causes. 15 | - Consider technology and process. 16 | 1. Open any follow-up action tickets. 17 | 1. Write the external messaging. 18 | 1. Ask for review. 19 | 1. Attend the postmortem meeting. 20 | 1. Share the postmortem. 
21 | 22 | ## Owner Responsibilities 23 | At the end of a major incident call, or very shortly after, the [Incident Commander](https://response.pagerduty.com/training/incident_commander/) selects one responder to own the postmortem. The selected owner will be notified directly by the Incident Commander. Writing the postmortem will ultimately be a collaborative effort, but selecting a single owner will help ensure it gets done. 24 | 25 | The owner of a postmortem is responsible for the following: 26 | 27 | - Scheduling the postmortem meeting on the shared calendar and inviting the relevant people (this should be scheduled within 3 calendar days for a Sev-1 and 5 business days for a Sev-2). 28 | - Investigating the incident, pulling in whoever is needed from other teams to assist in the investigation. 29 | - Ensuring the page is updated with all of the necessary content. See our [Template](../resources/post_mortem_template.md) for what should be included. 30 | - Creating follow-up tickets. (The owner is only responsible for creating the tickets, not following them up to resolution). 31 | - Reviewing the postmortem content with appropriate parties before the meeting, and running through the topics at the postmortem meeting (the Incident Commander will "run" the meeting and keep the discussion on track, but you will likely be doing most of the talking). 32 | - Communicating the results of the postmortem internally. 33 | 34 | The owner of a postmortem creates the postmortem document and updates it with all relevant information. 35 | 36 | 37 | ## Administration 38 | ![Administration](../assets/img/thumbnails/1Administration.png) 39 | 40 | 1. Create the document. 41 | 2. Add all responders to it. 42 | 3. Schedule the meeting. 43 | 44 | If not already done by the Incident Commander, the postmortem owner's first step is to create a new, empty postmortem for the Incident. 
Go through the history in Slack to identify the responders and add them to the page so they can help populate the postmortem. Include the Incident Commander and Scribe as well. Add a link to the incident call recording. 45 | 46 | Next, schedule the postmortem meeting for 30 minutes to an hour, depending on complexity of the incident. Scheduling the meeting at the beginning of the process helps ensure the postmortem is completed within the SLA. **The meeting should be scheduled within 3 calendar days for a Sev-1 and 5 business days for a Sev-2.** Don't worry about finding the best time for all attendees. The priority is to schedule within this timeframe and attendees should adjust their schedules accordingly. At PagerDuty, we schedule all postmortem meetings on a shared "Incident Postmortem Meetings" calendar so they are easily discoverable for any interested parties across the organization. 47 | 48 | Invite the following people to the postmortem meeting: 49 | 50 | - Always 51 | - The [incident commander](https://response.pagerduty.com/training/incident_commander/). 52 | - The incident commander shadowee (if there was one). 53 | - [Service owners](https://response.pagerduty.com/training/subject_matter_expert/) involved in the incident. 54 | - Key engineer(s)/responders involved in the incident. 55 | - Engineering manager for impacted systems. 56 | - Product manager for impacted systems. 57 | - Optional 58 | - [Customer liaison](https://response.pagerduty.com/training/customer_liaison/) (only for Sev-1 incidents). 59 | 60 | PagerDuty postmortems have a "Status" field that indicates where in our process the postmortem currently is. Here's a description of the values and how we use them. 61 | 62 | | Status | Description | 63 | |-|-| 64 | | **Draft** | Indicates that the content of the postmortem is still being worked on. | 65 | | **In Review** | The content of the postmortem has been completed, and is ready to be reviewed during the postmortem meeting. 
| 66 | | **Reviewed** | The meeting is over and the content has been reviewed and agreed upon.
If there is an "External Message", the Customer Support team will take the message and update our status page as appropriate. | 67 | | **Closed** | No further actions are needed on the postmortem (outstanding issues are tracked in JIRA).
If there is no "External Message", you can skip straight to this once the meeting is over.
If there's an "External Message", then the Support team will update it to this status once the message is posted. | 68 | 69 | ## Create a Timeline 70 | ![Timeline](../assets/img/thumbnails/2CreateATimeline.png) 71 | Begin by focusing on the timeline. Document the facts of what happened during the incident. Avoid evaluating what should or should not have been done and coming to conclusions about what caused the incident. Presenting only the facts here will help avoid blame and supports a deeper analysis. Note the incident may have started before responders became aware of it and began the response effort. The timeline includes important changes in status/impact and key actions taken by responders. To avoid hindsight bias, start your timeline at a point before the incident and work your way forward instead of backwards from resolution. 72 | 73 | Review the incident log in Slack to find key decisions made and actions taken during the response effort. Also include information the team didn't know during the incident that, in hindsight, you wish you would have. Find this additional information by looking at monitoring, logs, and deployments related to the affected services. You'll take a deeper look at monitoring during the analysis step, but start here by adding key events related to the incident, and include changes to incident status and the impact to the timeline. 74 | 75 | For each item in the timeline, identify a metric or some third-party page where the data came from. This helps illustrate each point clearly and ensures you remain rooted in fact rather than opinions. This could be a link to a monitoring graph, a log search, a tweet, etc.—anything that shows the data point you're trying to illustrate in the timeline. 76 | 77 | !!! info "Key Takeaways" 78 | * Stick to the facts. 79 | * Include changes to incident status and impact. 80 | * Include key decisions and actions taken by responders. 81 | * Illustrate each point with a metric. 
82 | 83 | ## Document Impact 84 | ![Impact](../assets/img/thumbnails/3DocumentImpact.png) 85 | Impact should be described from a few perspectives: 86 | 87 | - How long was the impact visible? In other words, what was the length of time users/customers were affected? 88 | - Note the length of impact may differ from the length of the response effort. Impact may have started some time before it was detected and incident response began. 89 | - How many customers were affected? 90 | - Support may need a list of all affected customers so they can reach out individually. 91 | - How many customers wrote or called support about the incident? 92 | - What functionality was affected and how severely? 93 | - Quantify impact with a business metric specific to your product. For PagerDuty this includes event submission, delayed processing, slow notification delivery, etc. 94 | 95 | ## Analyze the Incident 96 | ![Analyze](../assets/img/thumbnails/4AnalyzeTheIncident.png) 97 | Now that you have an understanding of what happened during the incident, look further back in time to find the contributing factors that led to the incident. Technology is a complex system with a network of relationships (organizational, human, technical) that is continuously changing. 98 | 99 | In his paper, "[How Complex Systems Fail](http://web.mit.edu/2.75/resources/random/How%20Complex%20Systems%20Fail.pdf)," Dr. Richard Cook says that because complex systems are heavily defended against failure, it is a unique combination of apparently innocuous failures that join to create catastrophic failure. Furthermore, because overt failure requires multiple faults, attributing a "root cause" is fundamentally wrong. 
**There is no single root cause of major failure in complex systems, but a combination of contributing factors that together lead to failure.** The postmortem owner's goal in analyzing the incident is not to identify the root cause, but to understand the multiple factors that created an environment where this failure became possible. 100 | 101 | Cook also says the effort to find the "root cause" does not reflect an understanding of the system, but rather the cultural need to blame specific, localized forces for events. Blamelessness is essential for an effective postmortem. **An individual's action should never be considered a root cause.** Effective analysis goes deeper than human action. In the cases where someone's mistake did contribute to a failure, it is worth anonymizing this in your analysis to avoid attaching blame to any individual. Assume any team member could have made the same mistake. According to Cook, "all practitioner actions are actually gambles, that is, acts that take place in the face of uncertain outcomes." 102 | 103 | The postmortem owner should start their analysis by looking at the monitoring for the affected services. Search for irregularities like sudden spikes or flatlining when the incident began and leading up to the incident. Include any commands or queries used to look up data, graph images, or links from monitoring tooling alongside this analysis so others can see how the data was gathered. If there is not monitoring for this service or behavior, make building monitoring an action item for this postmortem. More on [writing action items](#followup) below. 104 | 105 | !!! warning "Importance of Monitoring" 106 | Puppet's 2018 State of DevOps Report highlights making monitoring configurable by the team operating the service as a foundational practice for successful DevOps. Empowering teams to define, manage, and share their own measurement of performance contributes to a culture of continuous improvement. 
107 | 108 | Another helpful strategy for targeting what caused an incident is reproducing it in a non-production environment. Experiment by modifying variables to isolate the phenomenon. If you modify or remove some input does the incident still occur? 109 | 110 | This level of analysis will uncover the superficial causes of the incident. Next, ask why the system was designed in a way to make this possible. Why did those design decisions seem to be the best decisions at the time? Answering these questions will help you uncover root causes. 111 | 112 | Here are some questions to help the postmortem owner identify the class of a particular problem: 113 | 114 | - Is it an isolated incident or part of a trend? 115 | - Was this a specific bug, a failure in a class of problem we anticipated, or did it uncover a class of issue we did not architecturally anticipate? 116 | - Was there work the team chose not to do in the past that contributed to this incident? 117 | - Research if there were any similar or related incidents in the past. Does this incident demonstrate a larger trend in your system? 118 | - Will this class of issue get worse/more likely as you continue to grow and scale the use of the service? 119 | 120 | !!! tip 121 | At PagerDuty, we have a separate process for analyzing larger trends across multiple incidents to inform technical and organizational planning. Learn more in our guide on [Operational Reviews](http://reviews.pagerduty.com). 122 | 123 | Though it may not be a root cause, consider the process in your analysis. Did the way that people collaborate, communicate, and/or review work contribute to the incident? This is also an opportunity to evaluate and improve the incident response process. Consider what worked well and didn't work well within the incident response process during the incident. 124 | 125 | Write a summary of the findings in the postmortem. 
The team may find further learnings and identify additional causes through discussion in the meeting, but the owner should do as much pre-work and documentation as possible to ensure a productive discussion. 126 | 127 | ### Questions to Ask 128 | Below is a non-exhaustive list to help stimulate deep analysis. Ask "how" and "what" questions rather than "who" or "why" to discourage blame and encourage learning. 129 | 130 | 131 | 132 | 133 | 140 | 141 | 142 | 143 | 150 | 151 | 152 | 153 | 160 | 161 | 162 | 163 | 170 | 171 | 172 | 173 | 180 | 181 | 182 | 183 | 190 | 191 | 192 | 193 | 199 | 200 |
Cues 134 |
    135 |
  • What were you focusing on?
  • 136 |
  • What was not noticed?
  • 137 |
  • What differed from what was expected?
  • 138 |
139 |
Previous Knowledge/Experience 144 |
    145 |
  • Was this an anticipated class of problem or did it uncover a class of issue that was not architecturally anticipated?
  • 146 |
  • What expectations did participants have about how things were going to develop?
  • 147 |
  • Were there similar incidents in the past?
  • 148 |
149 |
Goals 154 |
    155 |
  • What goals governed your actions at the time?
  • 156 |
  • How did time pressure or other limitations influence choices?
  • 157 |
  • Was there work the team chose not to do in the past that could have prevented or mitigated this incident?
  • 158 |
159 |
Assessment 164 |
    165 |
  • What mistakes (for example, in interpretation) were likely?
  • 166 |
  • How did you view the health of the services involved prior to the incident?
  • 167 |
  • Did this incident teach you something that should change views about this service's health?
  • 168 |
169 |
Taking Action 174 |
    175 |
  • How did you judge you could influence the course of events?
  • 176 |
  • What options were taken to influence the course of events? How did you determine that these were the best options at the time?
  • 177 |
  • How did other influences (operational or organizational) help determine how you interpreted the situation and how you acted?
  • 178 |
179 |
Help 184 |
    185 |
  • Did you ask anyone for help?
  • 186 |
  • What signal brought you to ask for support?
  • 187 |
  • Were you able to contact the people you needed to contact?
  • 188 |
189 |
Process 194 |
    195 |
  • Did the way that people collaborate, communicate, and/or review work contribute to the incident?
  • 196 |
  • What worked well in your incident response process and what did not work well?
  • 197 |
198 |
201 | 202 | !!! info "Key Takeaways" 203 | * Find contributing factors, not the root cause. 204 | * Focus on the system, not the humans. 205 | * Look for anomalies in monitoring. 206 | * Reproduce and experiment in a non-production environment. 207 | * Don't forget to review your processes. 208 | 209 | ## Follow-Up Actions 210 | ![Followup](../assets/img/thumbnails/5FollowUpActions.png) 211 | After identifying what caused the incident, ask what needs to be done to prevent this from happening again. Based on your analysis, you may also have proposals to reduce the occurrence of this class of problem, rather than this specific incident from recurring. 212 | 213 | It may not be possible (or worth the effort) to completely eliminate the possibility of this same incident or a similar incident from happening again, so also consider how you can improve detection and mitigation of future incidents. Does the team need better monitoring and alerting around this class of problem so they can respond faster in the future? If this class of incident does happen again, how can the team decrease the severity or duration? Remember to identify any actions that can make the incident response process better, too. Go through the incident history in Slack to find any to-do items raised during the incident and make sure these are documented as tickets as well. (At this phase, you are only opening tickets. There is no expectation that tasks will be completed before the postmortem meeting.) 214 | 215 | Create tickets for all proposed follow-up actions in your task management tool. Label all tickets with their severity level and date tags so they can be easily found and reported in the ticketing system. Provide as much context and proposed direction on the tickets as you can so the team's product owner will have enough information to prioritize the task against other work and the eventual assignee will have enough information to complete the task. 
216 | 217 | In the _;login:_ magazine article, "[Postmortem Action Items: Plan the Work and Work the Plan](https://www.usenix.org/system/files/login/articles/login_spring17_09_lunney.pdf)," John Lunney, Sue Lueder, and Betsy Beyer write about how Google writes postmortem action items to ensure they are completed quickly and easily. They advise all action items to be written as actionable, specific, and bounded. 218 | 219 | - **Actionable:** Phrase each action item as a sentence starting with a verb. The action should result in a useful outcome. 220 | - **Specific:** Define each action item's scope as narrowly as possible, making clear what is in and out of scope. 221 | - **Bounded:** Word each action item to indicate how to tell when it is finished, as opposed to open-ended or ongoing tasks. 222 | 223 | | Poorly Worded | Better | 224 | |-|-| 225 | | Investigate monitoring for this scenario. | **Actionable:** Add alerting for all cases where this service returns >1% errors. | 226 | | Fix the issue that caused the outage. | **Specific:** Handle invalid postal code in user address form input safely. | 227 | | Make sure engineer checks that database schema can be parsed before updating. | **Bounded:** Add automated presubmit check for schema changes. | 228 | 229 | Source: _;login:_ Spring 2017 Vol. 42, No. 1. 230 | 231 | If there are any proposed follow-up actions that need discussion before tickets can be created, make a note to add these items to the postmortem meeting agenda. These may be proposals that need team validation or clarification. Discussing these items in the meeting will help decide how best to proceed. 232 | 233 | Be careful with creating too many tickets. Only create tickets that are P0/P1s; i.e., tasks that absolutely should be dealt with. There will be some trade-offs here, and that's fine. Sometimes the ROI isn't worth the effort that would go into performing an action that may reduce the recurrence of the incident. 
When that is the case, it is worth documenting that decision in the postmortem. Understanding why the team is choosing not to perform an action helps avoid learned helplessness. 234 | 235 | Note the person who creates the ticket is not responsible for completing it. Tickets are opened under the projects for the teams that own the affected service. At least one representative from each team that will be responsible for a follow-up action is invited to the postmortem meeting. 236 | 237 | 238 | !!! info "Key Takeaways" 239 | * What needs to be done to reduce the likelihood of this, or a similar, incident happening again? 240 | * How can you detect this type of incident sooner? 241 | * How can you decrease the severity or duration of this type of incident? 242 | * Write actionable, specific, and bounded tasks. 243 | 244 | ## Write External Messaging 245 | ![External](../assets/img/thumbnails/6WriteExternalMessaging.png) 246 | The goal of external messaging is to build trust by giving customers enough information about what happened and what you're doing about it, without giving away proprietary information about your technology and organization. There are parts of your internal analysis that primarily benefit the internal audience and do not need to be included in your external postmortem. 247 | 248 | The external postmortem is a summarized and sanitized version of the information used for the internal postmortem. External postmortems include these three sections: 249 | 250 | 1. **Summary:** Two to three sentences that summarize the duration of the incident and the observable customer impact. 251 | 1. **What Happened:** 252 | - Summary of cause(s). 253 | - Summary of customer-facing impact during the incident. 254 | - Summary of mitigation efforts during the incident. 255 | 1. **What Are We Doing About This:** Summary of action items. 
256 | 257 | >Tip: Avoid using the word "outage" unless it really was a full outage—use the word "incident" or "service degradation" instead. Customers generally see "outage" and assume the worst. 258 | 259 | Note that at this point, the external postmortem is drafted language that should not be sent or published. It needs to be reviewed during the postmortem meeting before being sent out. 260 | 261 | ## Postmortem Review 262 | ![Review](../assets/img/thumbnails/7PostmortemReview.png) 263 | At PagerDuty, we have a community of experienced postmortem writers available to review postmortems for style and content. This avoids wasted time during the meeting. We post a link to the postmortem into Slack to receive feedback at least 24 hours before the meeting is scheduled. 264 | 265 | Here are some of the things we look for: 266 | 267 | - Does it provide enough detail? 268 | - Rather than just pointing out what went wrong, does it drill down to the underlying causes of the issue? 269 | - Does it separate "What happened?" from "How to fix it"? 270 | - Do the proposed action items make sense? Are they well-scoped enough? 271 | - Is the postmortem well-written and understandable? 272 | - Does the external message resonate well with customers or is it likely to cause outrage? 273 | 274 | Reviewing a postmortem isn't about nitpicking typos (but do make sure the external message isn't littered with spelling and grammatical errors). It's about providing constructive feedback on valuable changes to a postmortem to get the most benefit from them. 275 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ![PagerDuty](assets/img/headers/Postmortems-Title.png) 2 | 3 | Performing postmortems after incidents is how you learn what you're doing right, where you could improve, and most importantly, how to avoid making the same mistakes again and again. 
Well-designed postmortems allow your teams to iteratively improve your infrastructure and incident response process. 4 | 5 | The postmortem concept is well known in the technology industry, but it can be difficult for newer individuals, teams, and organizations to adopt the cultural nuances required for effective postmortems. This guide will teach you how to build a culture of continuous learning, the most important components to include in your analysis, and how to conduct effective postmortem meetings. 6 | 7 | ## Who Is This For? 8 | This resource is for on-call practitioners who want to iteratively learn from incidents affecting their team and for managers who want to cultivate a culture of learning in their organization. 9 | 10 | ## What Is Covered? 11 | ### What Is a Postmortem? 12 | The who, what, when, and why of [postmortems](what_is.md). 13 | 14 | ### Blameless Culture 15 | A successful postmortem process is based on a culture of honesty, learning, and accountability. Culture change requires management buy-in, but you can lead culture change no matter your role. This section describes common challenges in building a culture of continuous learning through postmortems, and strategies for overcoming them. 16 | 17 | - [The Blameless Postmortem](culture/blameless.md) 18 | - [How to Introduce Postmortems](culture/introduce.md) 19 | - [Information Sharing](culture/sharing.md) 20 | - [Accountability](culture/accountability.md) 21 | 22 | ### How to Write a Postmortem 23 | You will learn what information to include in the postmortem, how to collect and present that information, and how to conduct an effective analysis that results in system improvements. 24 | 25 | - [Step by Step](how_to_write/writing.md) 26 | - [Tips for Effective Postmortems](how_to_write/effective_postmortems.md) 27 | 28 | ### Postmortem Meetings 29 | How to conduct productive [postmortem meetings](meeting.md). 
30 | 31 | ### Additional Resources 32 | 33 | - [Template](resources/post_mortem_template.md) 34 | - [Checklist](resources/checklist.md) 35 | - [Analysis Questions](resources/analysis.md) 36 | - [Examples](resources/examples.md) 37 | - [Further Reading](resources/reading.md) 38 | 39 | ### License 40 | This documentation is provided under the Apache License 2.0. In plain English that means you can use and modify this documentation and use it both commercially and for private use. However, you must include any original copyright notices, and the original LICENSE file. 41 | 42 | Whether you are a PagerDuty customer or not, we want you to have the ability to use this documentation internally at your own company. You can view the source code for all of this documentation on our GitHub account, feel free to fork the repository and use it as a base for your own internal documentation. 43 | -------------------------------------------------------------------------------- /docs/meeting.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: After you have completed the written postmortem, follow up with a meeting to discuss the incident. The purpose of this meeting is to deepen the postmortem analysis through direct communication and to get buy-in for action items. 4 | --- 5 | ![The Postmortem Meeting](assets/img/headers/Postmortems-Meeting.png) 6 | 7 | ## Purpose 8 | After you have completed the written postmortem, follow up with a meeting to discuss the incident. **The purpose of this meeting is to deepen the postmortem analysis through direct communication and to get buy-in for action items.** The asynchronous production of the written postmortem helps the team start learning from the incident, but having a conversation leads to deeper learning. 
Furthermore, having a meeting scheduled to discuss the written postmortem creates [accountability](culture/accountability.md) for the postmortem to be completed in a timely manner. Using this time to discuss action items also helps ensure that those tasks will be completed. 9 | 10 | An anti-pattern for the postmortem meeting is to be overly focused on the immediate concerns documented in the written postmortem. Avoid filling the meeting time by simply reading through each section of the document. The best use of this time is to take a step back from the detailed analysis to better understand the systemic factors that led to the incident. 11 | 12 | Some teams make use of the [Retrospective Prime Directive](http://retrospectivewiki.org/index.php?title=The_Prime_Directive) to set the tone for the meeting and serve as a regular reminder of the goals. It can be a helpful tool to anchor the discussion and provide a clean slate to start a retrospective, postmortem, or post-incident review. 13 | 14 | > 15 | "Regardless of what we discover, we understand and truly believe that everyone did the best job they could, given what they knew at the time, their skills and abilities, the resources available, and the situation at hand." 16 | --Norm Kerth, Project Retrospectives: A Handbook for Team Review 17 | 18 | 19 | **The most important outcome of the postmortem meeting is buy-in for the action plan.** This is an opportunity to discuss proposed [action items](how_to_write/writing.md), brainstorm other options, and gain consensus among team leadership. Sometimes the ROI of proposed action items is not great enough to justify the work or postmortem action items must be delayed for other priorities. The postmortem meeting is a time to discuss these difficult decisions and make clear what work will and will not be done, as well as the expected implications of those choices. 
20 | 21 | Whereas the written postmortem is intended to be shared widely in the organization, the primary audience for the postmortem meeting is the teams directly involved with the incident. This meeting gives the team a chance to align on what happened, what to do about it, and how they will communicate about the incident to internal and external stakeholders. 22 | 23 | !!! tip 24 | Send a link to the postmortem document to meeting attendees 24 hours before the meeting. Though the postmortem does not need to be complete when it is sent to the attendees, it should be finished before the postmortem meeting. It is still worth sending an incomplete postmortem to meeting attendees in advance so they can start reading through the document. 25 | 26 | This will help you avoid wasting time in the meeting simply reading through the document. Remember the purpose of the meeting is to have an in-depth conversation about what caused the incident and how to prevent it in the future, not to review the document. The postmortem meeting is also an opportunity to clarify any questions about what happened and what the team plans to do to prevent it from happening again. Encourage attendees to ask any and all questions to help everyone get on the same page 27 | and help the team consider new perspectives for their analysis. 28 | 29 | ## Agenda 30 | Here is a sample agenda for the meeting: 31 | 32 | 1. **Postmortem owner** summarizes incident causes and timeline. **Facilitator** leads discussion: 33 | - What were the larger cultural and structural factors that led to the incident? **How did we get here?** 34 | 1. **Postmortem owner** summarizes proposed follow-up action items. **Facilitator** leads discussion: 35 | - Is the team **confident** this plan will reduce the likelihood of this incident recurring? 36 | - **What more or different work might be needed?** 37 | - Will team leadership (Engineering Manager, Product Manager, Tech Lead, etc.) 
**commit** to prioritizing these action items? 38 | 1. **Customer liaison** summarizes customer impact. 39 | - Provide any new context about customer reaction to the incident. 40 | - Review and approve external communication drafted in the postmortem. 41 | 42 | ## Who Participates 43 | The postmortem owner invites the following people to the postmortem meeting. Below is more detail about the role each plays in the discussion. 44 | 45 | - Always 46 | - The [incident commander](https://response.pagerduty.com/training/incident_commander/). 47 | - The incident commander is responsible for coordinating the response. During the postmortem meeting the incident commander can provide valuable feedback on the incident response effort and process improvements. 48 | - The incident commander shadowee (if there was one). 49 | - This person may have served as the [scribe](https://response.pagerduty.com/training/scribe/) or [deputy](https://response.pagerduty.com/training/deputy/). The deputy incident commander is responsible for adding necessary responders to the call and updating internal stakeholders outside of the incident response call. The deputy can provide valuable feedback on the response effort and the ease or difficulty of communicating with additional responders and stakeholders during incident response. 50 | - [Service owners](https://response.pagerduty.com/training/subject_matter_expert/) and other key engineers involved in the incident. 51 | - On-call service owners and other engineers that responded to the incident are the experts of the affected services. During the postmortem meeting they can provide historical context about how the systems were built, cultural context about what was happening with the team leading up to the incident, and proposals for what work would reduce the likelihood of this incident recurring. 52 | - Productive postmortem discussions will include engineers with in-depth knowledge of the part of the system that their team owns. 
If the engineer(s) that responded to the incident are newer to the team, it will be helpful to include more experienced engineers from their team in the postmortem meeting. 53 | - Engineering manager for impacted systems. 54 | - The manager responsible for the teams that responded to the incident attends the postmortem meeting to inform their staffing and technical investment decisions. 55 | - Product manager for impacted systems. 56 | - Product managers attend postmortem meetings to understand the effect incidents have on their customers' experience. For postmortem action items to be prioritized and completed, it is critical to engage product managers in this discussion of the importance and scope of proposed follow-up tasks. 57 | - Optional (Only Sev-1 incidents) 58 | - [Customer liaison](https://response.pagerduty.com/training/customer_liaison/). 59 | - The customer liaison can speak to customers' reactions to the incident. They need to understand the team's decision on action items so they can finalize and send external messaging. 60 | 61 | ## Facilitation 62 | ### What Is Facilitation 63 | The facilitator's role in the postmortem meeting is different from the other participants. The facilitator does not voice their own ideas in the meeting; instead, they encourage the group to speak up and keep the discussion on track. The postmortem owner, the incident commander, or any other meeting attendee that played an active role during the incident are the ones who need to contribute to the discussion and should not also be responsible for facilitating. 64 | 65 | The facilitator: 66 | 67 | - Encourages people to speak up and makes sure that everyone is heard. 68 | - Clarifies insights and challenges the team with questions. 69 | - Helps the team see different perspectives and different options. 70 | - Keeps everyone on time and on track. Cuts off tangents and stops people from dominating the entire meeting. 71 | - Speaks as little as possible. 
Remember to guide the discussion, but do not take over the meeting. 72 | 73 | The facilitator does not: 74 | 75 | - Make decisions. 76 | - Take sides. If the facilitator takes sides, team members might feel attacked and might stop contributing to the meeting. 77 | - Comment on what people say, even if they are trying to give positive feedback. It may make the speaker feel validated, but it might also make the others feel worse about what they have to say or discourage them from contributing something. 78 | 79 | ### Who Should Facilitate 80 | Good facilitators tend to have a high level of emotional intelligence and can easily read non-verbal cues to understand how people are feeling. They use this sense to cultivate an environment where everyone is comfortable speaking. Agile coaches and project managers are often skilled facilitators. At PagerDuty, we have a guild of confident facilitators who coach individuals interested in learning how to facilitate. When searching for individuals in your organization to help facilitate postmortem meetings, look for people with these core competencies: 81 | 82 | - Can read non-verbal cues to assess how people are feeling in the room and spot who might have something to say. 83 | - Can paraphrase what is said to clarify for self and others. 84 | - Can ask open questions to stimulate deeper thinking. 85 | - Is comfortable interrupting when discussion gets off track or when someone dominates the discussion. 86 | - Can redirect conversation to focus on goals. 87 | - Can keep track of time and give time reminders. 88 | - Can drive discussion to decision-making and action items. 89 | 90 | Postmortem meeting facilitators do not need to be experts in the affected systems. Facilitators do not need to be well-versed in the content of the discussion. Remember, the facilitator does not contribute their own opinions to the discussion, but works to get others to speak. 
The meeting attendees that were involved with the incident response are the experts on the incident, and the facilitator will ask the right questions to encourage those experts to share information with the group. 91 | 92 | Your facilitator should, however, be familiar with the postmortem process and the goals of the postmortem meeting so they can guide the group discussion to achieve those goals. Postmortem meeting facilitators must have a strong understanding of [blamelessness](culture/blameless.md) so they can help the group avoid blaming speech in the meeting. 93 | 94 | ## Facilitation Tips 95 | The postmortem meeting facilitator helps the team dig deeper into their analysis, [avoid blame](culture/blameless.md), and get buy-in for their action items. Common challenges for the postmortem meeting are being overly focused on the written postmortem and succumbing to the tendency to blame individuals for system failure. Below are tips on how to run effective postmortem meetings and how to handle awkward situations when they arise. 96 | 97 | **Housekeeping** 98 | 99 | - Set ground rules at the beginning of the meeting. 100 | - Set the expectation that everyone should speak but no-one should hog the conversation. 101 | - Remind the group that we practice blameless postmortems. 102 | - Establish a safe word for when the conversation gets off track. 103 | - If a team member notices the conversation is getting off-topic they can say the safe word and have the team re-evaluate the usefulness of the discussion. At PagerDuty, some teams use the acronym ELMO which stands for "Enough, let's move on." This takes pressure off the facilitator alone to interrupt when discussion gets off-topic. 104 | - Share the agenda so the team is clear on what is on- and off-topic. 105 | - Use a timer to timebox. 106 | - You can timebox each agenda item. Presenting a timer makes everyone aware of the time limit and reduces the need for the facilitator to interrupt for time. 
107 | - Present the postmortem document from your laptop onto the TV so everyone can see. 108 | 109 | [**How to avoid blame:**](culture/blameless.md) 110 | 111 | - Remind the team at the start of the meeting and/or when blame occurs during the meeting that we have agreed to practice blameless postmortems and call each other out when blame occurs. 112 | - Look out for and avoid "who" or "why" questions, which limit analysis and imply blame. Instead ask "what" and "how" questions, such as: 113 | - "What did you think was happening?" 114 | - "What did you do next?" 115 | - "How did that action make sense at the time?" 116 | - When inquiring about a human action, abstract to a non-specific responder. Remind the team anyone could have made the same mistake. 117 | - "What could have led any responder to take that action?" 118 | 119 | **What to do when the conversation is getting off-topic:** 120 | 121 | - The facilitator's job is to keep the team on track; they will need to interrupt to remind the team of the meeting goals by asking if it is valuable to continue with a topic or if it can be taken offline. 122 | - "Sorry to interrupt, but this topic seems unrelated to the goals of this meeting, do we want to go back to the original topic or continue with this discussion?" 123 | - Timebox agenda items. Once the time is done they can vote if they want to keep talking for another few minutes. 124 | 125 | **What to do when one person is dominating the meeting:** 126 | 127 | - Say upfront that participation from everyone is important. Explain the facilitator's responsibilities so they won't be offended if they are asked to stop talking or speak up. Pay attention to how much people are talking throughout the meeting. 128 | - "I wasn't able to hear what the first person was saying."
129 | - Act as a mediator and call out when people are getting interrupted: "Hold that thought – I want to make sure Shari has a chance to finish" 130 | 131 | **If a team member has not said anything, how do you get them to contribute:** 132 | 133 | - "Let's go around the room and hear from everyone" 134 | - "What's stood out for you so far?" 135 | - "What else might we need to consider?" 136 | 137 | **How to stimulate analysis:** 138 | 139 | - Ask open questions, not questions that can be answered with "yes" or "no." 140 | - Reference our [analysis questions](resources/analysis.md). The team may have asked themselves these questions as they were preparing the written postmortem. Asking some of these in the meeting will encourage new, collaborative thinking. 141 | -------------------------------------------------------------------------------- /docs/next_steps.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: Now that you have learned how to create a postmortem, let's take a look at how to create one in the PagerDuty application. 4 | --- 5 | ![Next Steps](assets/img/headers/Postmortems-NextSteps.png) 6 | ## Create a Report in PagerDuty 7 | 8 | If you are using PagerDuty for incident management, we strongly encourage you to take advantage of our postmortems feature. This allows you to associate incidents and other data within PagerDuty with your report, which will help with timeline generation and allow you to write a more comprehensive report. Note that only non-stakeholders can create, modify, and/or delete postmortems. (For a matrix of user permissions, please see our [support page](https://support.pagerduty.com/docs/user-roles) and refer to the postmortems line items.)
9 | ### Create the Report 10 | 11 | To create a postmortem from an incident, you can select the (resolved) incident and click the New Postmortem Report button: 12 | 13 | ![Create a New Report Option 1](assets/img/thumbnails/NextSteps/1NewPostmortemReport.png) 14 | 15 | Alternatively, you can create a postmortem from the catalog, by either going to Incidents -> Postmortems or directly to `yoursubdomain.pagerduty.com/postmortems`. From there, you click New Report: 16 | 17 | ![Create a New Report Option 2](assets/img/thumbnails/NextSteps/2NewPostmortemReport.png) 18 | 19 | If you're creating a postmortem report from the catalog, you'll need to associate the incident after you start the report. If you include the estimated start and/or end times, the PagerDuty app will limit the possible incidents associated with that report to incidents that happened in that timeframe. 20 | 21 | ![Data Sources](assets/img/thumbnails/NextSteps/3PostmortemDataSources.png) 22 | 23 | 24 | Regardless of whether you created a report from an incident or the catalog, you can add additional incidents using the timeframe or incident number for situations where multiple incidents apply to a single report. 25 | 26 | The PagerDuty app will create a timeline to appear in the postmortem based on the in-app events: 27 | 28 | ![Create Timeline](assets/img/thumbnails/NextSteps/4CreateTimeline.png) 29 | 30 | If you have integrated with Slack or another data source, that information will also appear in the Available Data on the left. You can choose which items to add or remove using the arrows in the center. 31 | 32 | After you've completed the timeline, you will need to write in the Analysis. This section has several subsections. Some of the default subsections are Overview, What Happened, and Resolution: 33 | 34 | ![Postmortem Details](assets/img/thumbnails/NextSteps/5PostmortemDetail.png) 35 | 36 | Once you have the information you would like in the report, click Save & View Report. 
This will save the report in the Draft state (the report will also autosave in the Draft state). The states available for the postmortem report are: Draft, In Review, Reviewed, and Closed. You can edit the status by clicking on the report from the Postmortem Catalog and using the Status drop down menu, which is located at the top of the page: 37 | 38 | ![Change Postmortem Status](assets/img/thumbnails/NextSteps/6ChangePostmortemStatus.png) 39 | 40 | ## Addenda 41 | ### External Access 42 | You can export your postmortem report to a PDF at any stage. This is primarily used if there are reviewers not in the PagerDuty app or if there is a different, centralized tool for the company for others to view the final report. To save as a PDF, simply select the report from the Postmortem Catalog and click the Save as PDF button: 43 | 44 | ![Export Postmortem to PDF](assets/img/thumbnails/NextSteps/7ExportPostmortemPDF.png) 45 | 46 | ### Customizations 47 | We strongly recommend that you modify the default report template to fit your company's needs. This can involve adding or removing sections, changing wording to match common language, or modifying the clarifying text in each section so that it communicates what is needed. 48 | 49 | If you would like to add, edit, or remove sections you can do so under Settings in the Postmortem Catalog: 50 | 51 | ![Edit the Postmortem Template](assets/img/thumbnails/NextSteps/8EditPostmortemTemplate.png) 52 | 53 | You can Edit sections by clicking on the gear for the appropriate section. You can also click Add Section at the bottom of the template to add a completely new section: 54 | 55 | ![Edit Postmortem Section Text Areas](assets/img/thumbnails/NextSteps/9EditPostmortemSections.png) 56 | 57 | Changes will only apply to postmortem reports moving forward—they will not apply to reports that have already been created. 
58 | For guidance on which questions and clarifying information to include in your template sections, take a look at the [Analysis Questions section](https://postmortems.pagerduty.com/resources/analysis/) under the Resources for this guide. 59 | 60 | If at any point you'd like to start again from the default template, you can reset the template. To revert to the original default sections, click on the Reset Template button at the top of your Report Template. You will be prompted in a pop-up menu to Reset Template or Cancel. 61 | 62 | ### Navigating Between Reports and Associated Incidents 63 | Currently, the only way to see an incident associated with a report is to open the report and look at the incidents that have been added to it. You cannot currently view a report by navigating to an incident to see an associated report. 64 | -------------------------------------------------------------------------------- /docs/resources/analysis.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: Questions to ask to stimulate deep postmortem analysis. 4 | --- 5 | ![Analysis](../assets/img/headers/Postmortems-Questions.png) 6 | 7 | Inspired by Gary Klein’s debriefing questions in Sidney Dekker’s *The Field Guide To Understanding Human Error*, below is a non-exhaustive list to help stimulate deep analysis. Ask “how” and “what” questions, rather than “who” or “why,” to discourage blame and encourage learning. 8 | 9 | [Download as a PDF](../assets/pdf/PostmortemAnalysisQuestions.pdf). 10 | 11 | 12 | 13 | 14 | 21 | 22 | 23 | 24 | 31 | 32 | 33 | 34 | 41 | 42 | 43 | 44 | 51 | 52 | 53 | 54 | 61 | 62 | 63 | 64 | 71 | 72 | 73 | 74 | 80 | 81 |
Cues 15 |
    16 |
  • What were you focusing on?
  • 17 |
  • What was not noticed?
  • 18 |
  • What differed from what was expected?
  • 19 |
20 |
Previous Knowledge/Experience 25 |
    26 |
  • Was this an anticipated class of problem or did it uncover a class of issue that was not architecturally anticipated?
  • 27 |
  • What expectations did participants have about how things were going to develop?
  • 28 |
  • Were there similar incidents in the past?
  • 29 |
30 |
Goals 35 |
    36 |
  • What goals governed your actions at the time?
  • 37 |
  • How did time pressure or other limitations influence choices?
  • 38 |
  • Was there work the team chose not to do in the past that could have prevented or mitigated this incident?
  • 39 |
40 |
Assessment 45 |
    46 |
  • What mistakes (for example, in interpretation) were likely?
  • 47 |
  • How did you view the health of the services involved prior to the incident?
  • 48 |
  • Did this incident teach you something that should change views about this service’s health?
  • 49 |
50 |
Taking Action 55 |
    56 |
  • How did you judge you could influence the course of events?
  • 57 |
  • What options were taken to influence the course of events? How did you determine that these were the best options at the time?
  • 58 |
  • How did other influences (operational or organizational) help determine how you interpreted the situation and how you acted?
  • 59 |
60 |
Help 65 |
    66 |
  • Did you ask anyone for help?
  • 67 |
  • What signal brought you to ask for support?
  • 68 |
  • Were you able to contact the people you needed to contact?
  • 69 |
70 |
Process 75 |
    76 |
  • Did the way that people collaborate, communicate, and/or review work contribute to the incident?
  • 77 |
  • What worked well in your incident response process and what did not work well?
  • 78 |
79 |
82 | -------------------------------------------------------------------------------- /docs/resources/checklist.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: A checklist for performing a postmortem. 4 | --- 5 | ![Postmortems Checklist](../assets/img/headers/Postmortems-Checklist.png) 6 | 7 | 8 | For each incident that requires a postmortem (Sev-1 or Sev-2 incidents), we clone a ticket template that has subtasks for each step of performing a postmortem. This helps a team collaborate on creating the postmortem and provides visibility on progress leading up to the postmortem meeting. 9 | 10 | Below are the steps involved in performing a postmortem at a high level. 11 | 12 | [Download as a PDF](../assets/pdf/PostmortemChecklist.pdf). 13 | 14 | ![Checklist](../assets/img/thumbnails/PostmortemChecklist.png) 15 | -------------------------------------------------------------------------------- /docs/resources/examples.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: Examples of postmortems. 
4 | --- 5 | ![Postmortem Examples](../assets/img/headers/Postmortems-Examples.png) 6 | 7 | 8 | Here are some examples of postmortems from other companies as a reference: 9 | 10 | * [Stripe](https://support.stripe.com/questions/outage-postmortem-2015-10-08-utc) 11 | * [LastPass](https://blog.lastpass.com/2015/06/lastpass-security-notice.html/comment-page-2/) 12 | * [AWS](https://aws.amazon.com/message/5467D2/) 13 | * [Twilio](https://www.twilio.com/blog/2013/07/billing-incident-post-mortem-breakdown-analysis-and-root-cause.html) 14 | * [Heroku](https://status.heroku.com/incidents/151) 15 | * [Netflix](http://techblog.netflix.com/2012/10/post-mortem-of-october-222012-aws.html) 16 | * [GOV.UK Rail Accident Investigation](https://www.gov.uk/government/publications/kyle-beck-safety-digest/near-miss-at-kyle-beck-3-august-2016) 17 | * [A List of Post-mortems!](https://github.com/danluu/post-mortems) 18 | -------------------------------------------------------------------------------- /docs/resources/post_mortem_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: This is a standard template we use for postmortems at PagerDuty. Each section describes the type of information you will want to put in that section. 4 | --- 5 | ![Postmortem Template](../assets/img/headers/Postmortems-Template.png) 6 | 7 | 8 | This is a standard template we use for postmortems at PagerDuty. Each section describes the type of information you will want to put in that section. 9 | 10 | [Download](../assets/pdf/PostmortemTemplate.pdf) as a PDF to start using with your team.
11 | 12 | --- 13 | ![Template](../assets/img/thumbnails/PostmortemTemplate_preview.png) 14 | -------------------------------------------------------------------------------- /docs/resources/reading.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: This is a collection of additional reading on the topic of incident response that we've found useful. 4 | --- 5 | ![Further Reading and Resources](../assets/img/headers/Postmortems-Resources.png) 6 | 7 | ## Creating a Blameless Culture 8 | ### Books 9 | * [The Field Guide to Understanding Human Error](https://www.amazon.com/Field-Guide-Understanding-Human-Error/dp/0754648265) (Sidney Dekker) 10 | * [Crucial Accountability](https://www.amazon.com/Crucial-Accountability-Resolving-Expectations-Commitments/dp/0071829318) (Kerry Patterson, Joseph Grenny, Ron McMillan, Al Switzler, David Maxfield) 11 | 12 | ### Articles 13 | * [Blame. Language. Sharing.](http://fractio.nl/2015/10/30/blame-language-sharing/) (Lindsay Holmwood) 14 | 15 | ### Talks 16 | * "[Three analytical traps in accident investigation](https://www.youtube.com/watch?v=TqaFT-0cY7U)" (Johan Bergstrom) 17 | * "[Two views on Human Error](https://www.youtube.com/watch?v=rHeukoWWtQ8)" (Johan Bergstrom) 18 | * [Advanced PostMortem Fu and Human Error 101 (Velocity 2011)](http://www.slideshare.net/jallspaw/advanced-postmortem-fu-and-human-error-101-velocity-2011) (John Allspaw) 19 | 20 | ## How to Analyze Incidents 21 | ### Articles 22 | * [The Infinite Hows](https://www.oreilly.com/ideas/the-infinite-hows) (John Allspaw) 23 | 24 | ### Documents 25 | * [Postmortem Action Items: Plan the Work and Work the Plan](https://www.usenix.org/system/files/login/articles/login_spring17_09_lunney.pdf) (John Lunney, Sue Lueder, and Betsy Beyer) 26 | 27 | ## Process and Mechanics of Postmortems and Retrospectives 28 | * [The Agile Retrospective
Wiki](http://retrospectivewiki.org/index.php?title=Agile_Retrospective_Resource_Wiki) 29 | -------------------------------------------------------------------------------- /docs/what_is.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: 3 | description: The basics of Postmortems. Why postmortems are important, when they should be done, and who is responsible for the postmortem. 4 | --- 5 | ![What is a Postmortem?](assets/img/headers/Postmortems-WhatIs.png) 6 | 7 | > What went wrong and how do we learn from it? 8 | 9 | A postmortem (or post-mortem) is a process intended to help you learn from past incidents. It typically involves a blame-free analysis and discussion soon after an event has taken place. An artifact is produced that includes a detailed description of exactly what went wrong in order to cause the incident, along with a list of steps to take in order to prevent a similar incident from occurring again in the future. An analysis of how effective your incident response process itself was during the incident should also be included in the discussion. The value of postmortems comes from helping institutionalize a culture of continuous improvement. 10 | 11 | Organizations may refer to the postmortem process in slightly different terms: 12 | 13 | - Learning Review 14 | - After-Action Review 15 | - Incident Review 16 | - Incident Report 17 | - Post-Incident Review 18 | - Root Cause Analysis (or RCA) 19 | 20 | ## Why Do Postmortems 21 | During incident response, the team is 100% focused on restoring service. They cannot (and should not) be wasting time and mental energy thinking about how to do something optimally or performing a deep dive on what caused the incident. That's why postmortems are essential—they provide an opportunity to reflect once the issue is no longer impacting users. 
**The postmortem process drives focus, instills a culture of learning, and identifies opportunities for improvement that otherwise would be lost.** 22 | 23 | Without a postmortem, you fail to recognize what you're doing right, where you could improve, and, most importantly, how to avoid making the same mistakes in the future. Conducting an effective postmortem allows you to learn quickly from your mistakes and improve your systems and processes. A well-designed, blameless postmortem allows teams to continuously learn, serving as a way to iteratively improve your infrastructure and incident response process. Be sure to write detailed and accurate postmortems in order to get the most benefit out of them. 24 | 25 | ## When to Do a Postmortem 26 | **Do a postmortem for every major incident** (Sev-2/1). This includes **any time incident response is triggered**—even if it is later discovered that severity was actually lower, it was a false alarm, or it quickly recovered without intervention. A postmortem should not be neglected in these cases because it is still an opportunity to review what did and did not work well in the incident response process. If the incident should not have triggered incident response, it is worthwhile understanding why it did so monitoring can be tuned to avoid unnecessarily triggering incident response in the future. Doing this analysis and follow-up action will help prevent alert fatigue going forward. 27 | 28 | Postmortems are done shortly after the incident is resolved, while the context is still fresh for all responders. Just as resolving a major incident becomes top priority when it occurs, completing the postmortem is prioritized over planned work. Completing the postmortem is the final step of your incident response process. Delaying the postmortem delays key learning that will prevent the incident from recurring. 
29 | 30 | **PagerDuty's internal policy for completing postmortems is 3 calendar days for a Sev-1 and 5 business days for a Sev-2.** Because scheduling a time when everyone is available can be difficult, the expectation is people will adjust their calendars to attend the postmortem meeting within this timeframe. 31 | 32 | ## Who Is Responsible for the Postmortem 33 | At the end of a major incident call, or very shortly after, the [Incident Commander](https://response.pagerduty.com/training/incident_commander/) selects and directly notifies one responder to own the postmortem. Note that the postmortem owner is not solely responsible for completing the postmortem themselves. **Writing a postmortem is a collaborative effort** and should include everyone involved in the incident response. While engineering will lead the analysis, the postmortem process should involve management, customer support, and business communications teams. The postmortem owner coordinates with everyone who needs to be involved to ensure it is completed in a timely manner. 34 | 35 | It is important to designate a single owner to avoid the bystander effect. If you ask all responders or a team to do the postmortem, you risk everyone assuming someone else is doing it, and therefore, no one does. When selecting an owner you may choose a single individual who meets any of the following criteria: 36 | 37 | - Took a leadership role investigating during the incident 38 | - Performed a task that led to stabilizing the service 39 | - Was the primary on-call responder for the most heavily affected service 40 | - Manually triggered the incident to initiate incident response 41 | 42 | Doing the postmortem is not a punishment, and the owner is not the person that "caused" the incident. Effective postmortems are blameless. In complex systems there is never a single cause, but a combination of factors that lead to failure. 
The owner is simply an accountable individual who performs select administrative tasks, follows up for information, and drives the postmortem to completion. Writing the postmortem will ultimately be a collaborative effort, but selecting a single owner to orchestrate this collaboration helps ensure it is done. 43 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | # Project Information 2 | site_name: PagerDuty Postmortem Documentation 3 | site_description: A collection of information about the PagerDuty postmortem process and industry best practices. This guide will teach you how to build a culture of continuous learning, the most important components to include in your analysis, and how to conduct effective postmortem meetings. 4 | site_author: PagerDuty, Inc. 5 | site_url: https://postmortems.pagerduty.com/ 6 | 7 | # Repository 8 | repo_url: https://github.com/pagerduty/postmortem-docs 9 | 10 | # Copyright 11 | copyright: 'Copyright © PagerDuty, Inc.' 
12 | 13 | # Theme 14 | theme: 15 | name: pagerduty 16 | title: 'Postmortems' 17 | 18 | # Set default cover image 19 | extra: 20 | cover: assets/img/covers/default.png 21 | 22 | # Contents 23 | nav: 24 | - Home: 'index.md' 25 | - What Is a Postmortem: 'what_is.md' 26 | - Blameless Culture: 27 | - The Blameless Postmortem: 'culture/blameless.md' 28 | - How to Introduce Postmortems: 'culture/introduce.md' 29 | - Information Sharing: 'culture/sharing.md' 30 | - Accountability: 'culture/accountability.md' 31 | - How to Write a Postmortem: 32 | - Step by Step: 'how_to_write/writing.md' 33 | - Tips for Effective Postmortems: 'how_to_write/effective_postmortems.md' 34 | - The Postmortem Meeting: 'meeting.md' 35 | - Next Steps: 'next_steps.md' 36 | - Resources: 37 | - Template: 'resources/post_mortem_template.md' 38 | - Checklist: 'resources/checklist.md' 39 | - Analysis Questions: 'resources/analysis.md' 40 | - Examples: 'resources/examples.md' 41 | - Further Reading: 'resources/reading.md' 42 | 43 | # Analytics 44 | google_analytics: ['UA-8759953-1', 'auto'] 45 | 46 | # Extensions 47 | markdown_extensions: 48 | - toc: 49 | permalink: '#' 50 | - sane_lists: 51 | - admonition: 52 | - meta: 53 | - pymdownx.details: 54 | - pymdownx.extra: 55 | - pymdownx.mark: 56 | - pymdownx.tilde: 57 | - pymdownx.highlight: 58 | - pymdownx.superfences: 59 | - pymdownx.tabbed: 60 | 61 | # Development URL, bind to local only. 
62 | dev_addr: '127.0.0.1:8000' 63 | -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | [[headers]] 2 | for = "/*" 3 | [headers.values] 4 | Strict-Transport-Security = "max-age=31536000 ; includeSubDomains" 5 | X-Content-Type-Options = "nosniff" 6 | X-Frame-Options = "DENY" 7 | X-XSS-Protection = "1; mode=block" 8 | Referrer-Policy = "same-origin" 9 | Feature-Policy = "accelerometer 'none'; camera 'none'; geolocation 'none'; gyroscope 'none'; magnetometer 'none'; microphone 'none'; payment 'none'; usb 'none'" 10 | Content-Security-Policy-Report-Only = "default-src 'none'; script-src 'self' ssl.google-analytics.com www.google-analytics.com 'sha256-AzwHtScSzFOoXIoLRz4+vK2rDADGdNC3AXQG5FjKK68=' 'sha256-8IZh+gkrYCTCvcu/zawPN9Vj4RLghgOT4F5rTZXvdkQ='; object-src 'self'; style-src 'self'; img-src 'self' data: ssl.google-analytics.com www.google-analytics.com; media-src 'none'; frame-src www.youtube.com www.youtube-nocookie.com; font-src 'self'; connect-src 'self' ssl.google-analytics.com www.google-analytics.com; base-uri 'none'; form-action 'self'; frame-ancestors 'none'" 11 | 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | pymdown-extensions 3 | pygments 4 | git+https://github.com/pagerduty/mkdocs-theme-pagerduty.git@master 5 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | 3.8 2 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PagerDuty/postmortem-docs/93bdc0694e9b5d61875eb07f73187a823069a1aa/screenshot.png --------------------------------------------------------------------------------