├── .github ├── CODEOWNERS └── workflows │ ├── publish-docs.yaml │ └── validate-docs.yaml ├── .gitignore ├── .gitpod.yml ├── LICENSE ├── Makefile ├── README.rst ├── VERSION ├── _extra └── robots.txt ├── _static ├── SystemsApproachLogoURL.png ├── bridge.ico ├── cover.jpg ├── css │ └── rtd_theme_mods.css └── fonts │ ├── Inconsolata-Bold.ttf │ └── Inconsolata-Regular.ttf ├── algorithm.rst ├── aqm.rst ├── authors.rst ├── avoidance.rst ├── biblio.rst ├── code ├── README ├── build.sh ├── cwin.c ├── nagle.c ├── red.c └── timeout.c ├── conf.py ├── design.rst ├── dict.txt ├── figures ├── Figure-sources.pptx ├── Graph_16B.png ├── Graph_1A.png ├── Graph_1B.png ├── Graph_6C.png ├── Graph_8B.png ├── Graph_8C.png ├── Slide1.png ├── Slide10.png ├── Slide11.png ├── Slide12.png ├── Slide13.png ├── Slide14.png ├── Slide15.png ├── Slide16.png ├── Slide2.png ├── Slide3.png ├── Slide4.png ├── Slide5.png ├── Slide6.png ├── Slide7.png ├── Slide8.png ├── Slide9.png ├── f03-16-9780123850591.png ├── f05-03-9780123850591.png ├── f05-04-9780123850591.png ├── f05-05-9780123850591.png ├── f05-08-9780123850591.png ├── f05-10-9780123850591.png ├── f06-03-9780123850591.png ├── f06-05-9780123850591.png ├── f06-08-9780123850591.png ├── f06-09-9780123850591.png ├── f06-10-9780123850591.png ├── f06-11-9780123850591.png ├── f06-12-9780123850591.png ├── f06-13-9780123850591.png ├── f06-14-9780123850591.png ├── f06-15-9780123850591.png ├── f06-16-9780123850591.png ├── f06-17-9780123850591.png ├── f06-18-9780123850591.png └── f06-19-9780123850591.png ├── foreword.rst ├── index.rst ├── intro.rst ├── latest.rst ├── preface.rst ├── print.rst ├── requirements.txt ├── tcp_ip.rst └── variants.rst /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #require review 2 | * @llpeterson @drbruced12 3 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docs Workflow 2 | run-name: ${{ github.actor }} is publishing document artifacts 🚀 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 15 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | 20 | jobs: 21 | # Single deploy job since we're just deploying 22 | deploy: 23 | environment: 24 | name: github-pages 25 | url: ${{ steps.deployment.outputs.page_url }} 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v4 30 | - name: Setup Pages 31 | uses: actions/configure-pages@v4 32 | - name: Build html 33 | run: make html 34 | - name: Upload artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | # Upload build repository 38 | path: './_build/html' 39 | - name: Deploy to GitHub Pages 40 | id: deployment 41 | uses: actions/deploy-pages@v4 42 | 43 | 44 | - run: echo "🍏 This job's status is ${{ job.status }}." 
45 | -------------------------------------------------------------------------------- /.github/workflows/validate-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Validate Docs Workflow 2 | run-name: ${{ github.actor }} is validating document source 3 | on: [pull_request, workflow_dispatch] 4 | jobs: 5 | Validate_Docs: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 9 | - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" 10 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 11 | - name: Check out repo 12 | uses: actions/checkout@v4 13 | - name: Validate source 14 | run: make test 15 | - name: Build html 16 | run: make html 17 | - name: List built files 18 | run: | 19 | ls ${{ github.workspace }}/_build/html 20 | 21 | - run: echo "🍏 This job's status is ${{ job.status }}." 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.tex 3 | venv-docs 4 | .DS_Store 5 | */.DS_Store 6 | figures-pdf/ 7 | figures-hi_res/ 8 | figures-low_res/ 9 | private/ 10 | local/ 11 | scripts/ 12 | _build/ 13 | -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | # This configuration file was automatically generated by Gitpod. 2 | # Please adjust to your needs (see https://www.gitpod.io/docs/config-gitpod-file) 3 | # and commit this file to your remote git repository to share the goodness with others. 4 | 5 | tasks: 6 | - init: sudo apt-get -y install libenchant1c2a 7 | command: make html 8 | - name: Start web server 9 | init: python -m http.server 8000 10 | 11 | ports: 12 | - port: 8000 13 | onOpen: open-preview 14 | 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and 4 | agree to be bound by the terms and conditions of this Creative Commons 5 | Attribution-NonCommercial-NoDerivatives 4.0 International Public 6 | License ("Public License"). To the extent this Public License may be 7 | interpreted as a contract, You are granted the Licensed Rights in 8 | consideration of Your acceptance of these terms and conditions, and 9 | the Licensor grants You such rights in consideration of benefits the 10 | Licensor receives from making the Licensed Material available under 11 | these terms and conditions. 12 | 13 | Section 1 – Definitions. 14 | 15 | (a) Adapted Material means material subject to Copyright and Similar 16 | Rights that is derived from or based upon the Licensed Material and 17 | in which the Licensed Material is translated, altered, arranged, 18 | transformed, or otherwise modified in a manner requiring permission 19 | under the Copyright and Similar Rights held by the Licensor. For 20 | purposes of this Public License, where the Licensed Material is a 21 | musical work, performance, or sound recording, Adapted Material is 22 | always produced where the Licensed Material is synched in timed 23 | relation with a moving image.
24 | 25 | (b) Copyright and Similar Rights means copyright and/or similar 26 | rights closely related to copyright including, without limitation, 27 | performance, broadcast, sound recording, and Sui Generis Database 28 | Rights, without regard to how the rights are labeled or 29 | categorized. For purposes of this Public License, the rights 30 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 31 | Rights. 32 | 33 | (c) Effective Technological Measures means those measures that, in 34 | the absence of proper authority, may not be circumvented under laws 35 | fulfilling obligations under Article 11 of the WIPO Copyright Treaty 36 | adopted on December 20, 1996, and/or similar international 37 | agreements. 38 | 39 | (d) Exceptions and Limitations means fair use, fair dealing, and/or 40 | any other exception or limitation to Copyright and Similar Rights 41 | that applies to Your use of the Licensed Material. 42 | 43 | (e) Licensed Material means the artistic or literary work, database, 44 | or other material to which the Licensor applied this Public License. 45 | 46 | (f) Licensed Rights means the rights granted to You subject to the 47 | terms and conditions of this Public License, which are limited to 48 | all Copyright and Similar Rights that apply to Your use of the 49 | Licensed Material and that the Licensor has authority to license. 50 | 51 | (g) Licensor means the individual(s) or entity(ies) granting rights 52 | under this Public License. 53 | 54 | (h) NonCommercial means not primarily intended for or directed 55 | towards commercial advantage or monetary compensation. For purposes 56 | of this Public License, the exchange of the Licensed Material for 57 | other material subject to Copyright and Similar Rights by digital 58 | file-sharing or similar means is NonCommercial provided there is no 59 | payment of monetary compensation in connection with the exchange. 60 | 61 | (i) Share means to provide material to the public by any means or 62 | process that requires permission under the Licensed Rights, such as 63 | reproduction, public display, public performance, distribution, 64 | dissemination, communication, or importation, and to make material 65 | available to the public including in ways that members of the public 66 | may access the material from a place and at a time individually 67 | chosen by them. 68 | 69 | (j) Sui Generis Database Rights means rights other than copyright 70 | resulting from Directive 96/9/EC of the European Parliament and of 71 | the Council of 11 March 1996 on the legal protection of databases, 72 | as amended and/or succeeded, as well as other essentially equivalent 73 | rights anywhere in the world. 74 | 75 | (k) You means the individual or entity exercising the Licensed 76 | Rights under this Public License. Your has a corresponding meaning. 77 | 78 | Section 2 – Scope. 79 | 80 | (a) License grant. 81 | 82 | (1) Subject to the terms and conditions of this Public License, 83 | the Licensor hereby grants You a worldwide, royalty-free, 84 | non-sublicensable, non-exclusive, irrevocable license to exercise 85 | the Licensed Rights in the Licensed Material to: 86 | 87 | (A) reproduce and Share the Licensed Material, in whole or in 88 | part, for NonCommercial purposes only; and 89 | 90 | (B) produce and reproduce, but not Share, Adapted Material for 91 | NonCommercial purposes only. 92 | 93 | (2) Exceptions and Limitations. 
For the avoidance of doubt, where 94 | Exceptions and Limitations apply to Your use, this Public License 95 | does not apply, and You do not need to comply with its terms and 96 | conditions. 97 | 98 | (3) Term. The term of this Public License is specified in Section 99 | 6(a). 100 | 101 | (4) Media and formats; technical modifications allowed. The 102 | Licensor authorizes You to exercise the Licensed Rights in all 103 | media and formats whether now known or hereafter created, and to 104 | make technical modifications necessary to do so. The Licensor 105 | waives and/or agrees not to assert any right or authority to 106 | forbid You from making technical modifications necessary to 107 | exercise the Licensed Rights, including technical modifications 108 | necessary to circumvent Effective Technological Measures. For 109 | purposes of this Public License, simply making modifications 110 | authorized by this Section 2(a)(4) never produces Adapted 111 | Material. 112 | 113 | (5) Downstream recipients. 114 | 115 | (A) Offer from the Licensor – Licensed Material. Every recipient 116 | of the Licensed Material automatically receives an offer from 117 | the Licensor to exercise the Licensed Rights under the terms and 118 | conditions of this Public License. 119 | 120 | (B) No downstream restrictions. You may not offer or impose any 121 | additional or different terms or conditions on, or apply any 122 | Effective Technological Measures to, the Licensed Material if 123 | doing so restricts exercise of the Licensed Rights by any 124 | recipient of the Licensed Material. 125 | 126 | (6) No endorsement. Nothing in this Public License constitutes or 127 | may be construed as permission to assert or imply that You are, or 128 | that Your use of the Licensed Material is, connected with, or 129 | sponsored, endorsed, or granted official status by, the Licensor 130 | or others designated to receive attribution as provided in Section 131 | 3(a)(1)(A)(i). 132 | 133 | (b) Other rights. 134 | 135 | (1) Moral rights, such as the right of integrity, are not licensed 136 | under this Public License, nor are publicity, privacy, and/or 137 | other similar personality rights; however, to the extent possible, 138 | the Licensor waives and/or agrees not to assert any such rights 139 | held by the Licensor to the limited extent necessary to allow You 140 | to exercise the Licensed Rights, but not otherwise. 141 | 142 | (2) Patent and trademark rights are not licensed under this Public 143 | License. 144 | 145 | (3) To the extent possible, the Licensor waives any right to 146 | collect royalties from You for the exercise of the Licensed 147 | Rights, whether directly or through a collecting society under any 148 | voluntary or waivable statutory or compulsory licensing scheme. In 149 | all other cases the Licensor expressly reserves any right to 150 | collect such royalties, including when the Licensed Material is 151 | used other than for NonCommercial purposes. 152 | 153 | Section 3 – License Conditions. 154 | 155 | Your exercise of the Licensed Rights is expressly made subject to the 156 | following conditions. 157 | 158 | (a) Attribution. 
159 | 160 | (1) If You Share the Licensed Material, You must: 161 | 162 | (A) retain the following if it is supplied by the Licensor with 163 | the Licensed Material: 164 | 165 | (i) identification of the creator(s) of the Licensed Material 166 | and any others designated to receive attribution, in any 167 | reasonable manner requested by the Licensor (including by 168 | pseudonym if designated); 169 | 170 | (ii) a copyright notice; 171 | 172 | (iii) a notice that refers to this Public License; 173 | 174 | (iv) a notice that refers to the disclaimer of warranties; 175 | 176 | (v) a URI or hyperlink to the Licensed Material to the extent 177 | reasonably practicable; 178 | 179 | (B) indicate if You modified the Licensed Material and retain an 180 | indication of any previous modifications; and 181 | 182 | (C) indicate the Licensed Material is licensed under this Public 183 | License, and include the text of, or the URI or hyperlink to, 184 | this Public License. 185 | 186 | For the avoidance of doubt, You do not have permission under this 187 | Public License to Share Adapted Material. 188 | 189 | (2) You may satisfy the conditions in Section 3(a)(1) in any 190 | reasonable manner based on the medium, means, and context in which 191 | You Share the Licensed Material. For example, it may be reasonable 192 | to satisfy the conditions by providing a URI or hyperlink to a 193 | resource that includes the required information. 194 | 195 | (3) If requested by the Licensor, You must remove any of the 196 | information required by Section 3(a)(1)(A) to the extent 197 | reasonably practicable. 198 | 199 | Section 4 – Sui Generis Database Rights. 200 | 201 | Where the Licensed Rights include Sui Generis Database Rights that 202 | apply to Your use of the Licensed Material: 203 | 204 | (a) for the avoidance of doubt, Section 2(a)(1) grants You the right 205 | to extract, reuse, reproduce, and Share all or a substantial portion 206 | of the contents of the database for NonCommercial purposes only and 207 | provided You do not Share Adapted Material; 208 | 209 | (b) if You include all or a substantial portion of the database 210 | contents in a database in which You have Sui Generis Database 211 | Rights, then the database in which You have Sui Generis Database 212 | Rights (but not its individual contents) is Adapted Material; and 213 | 214 | (c) You must comply with the conditions in Section 3(a) if You Share 215 | all or a substantial portion of the contents of the database. 216 | 217 | For the avoidance of doubt, this Section 4 supplements and does not 218 | replace Your obligations under this Public License where the Licensed 219 | Rights include other Copyright and Similar Rights. 220 | 221 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 222 | 223 | (a) Unless otherwise separately undertaken by the Licensor, to the 224 | extent possible, the Licensor offers the Licensed Material as-is and 225 | as-available, and makes no representations or warranties of any kind 226 | concerning the Licensed Material, whether express, implied, 227 | statutory, or other. This includes, without limitation, warranties 228 | of title, merchantability, fitness for a particular purpose, 229 | non-infringement, absence of latent or other defects, accuracy, or 230 | the presence or absence of errors, whether or not known or 231 | discoverable. Where disclaimers of warranties are not allowed in 232 | full or in part, this disclaimer may not apply to You. 
233 | 234 | (b) To the extent possible, in no event will the Licensor be liable 235 | to You on any legal theory (including, without limitation, 236 | negligence) or otherwise for any direct, special, indirect, 237 | incidental, consequential, punitive, exemplary, or other losses, 238 | costs, expenses, or damages arising out of this Public License or 239 | use of the Licensed Material, even if the Licensor has been advised 240 | of the possibility of such losses, costs, expenses, or 241 | damages. Where a limitation of liability is not allowed in full or 242 | in part, this limitation may not apply to You. 243 | 244 | (c) The disclaimer of warranties and limitation of liability 245 | provided above shall be interpreted in a manner that, to the extent 246 | possible, most closely approximates an absolute disclaimer and 247 | waiver of all liability. 248 | 249 | Section 6 – Term and Termination. 250 | 251 | (a) This Public License applies for the term of the Copyright and 252 | Similar Rights licensed here. However, if You fail to comply with 253 | this Public License, then Your rights under this Public License 254 | terminate automatically. 255 | 256 | (b) Where Your right to use the Licensed Material has terminated 257 | under Section 6(a), it reinstates: 258 | 259 | (1) automatically as of the date the violation is cured, provided 260 | it is cured within 30 days of Your discovery of the violation; or 261 | 262 | (2) upon express reinstatement by the Licensor. 263 | 264 | For the avoidance of doubt, this Section 6(b) does not affect 265 | any right the Licensor may have to seek remedies for Your violations 266 | of this Public License. 267 | 268 | (c) For the avoidance of doubt, the Licensor may also offer the 269 | Licensed Material under separate terms or conditions or stop 270 | distributing the Licensed Material at any time; however, doing so 271 | will not terminate this Public License. 272 | 273 | (d) Sections 1, 5, 6, 7, and 8 survive termination of this Public 274 | License. 275 | 276 | Section 7 – Other Terms and Conditions. 277 | 278 | (a) The Licensor shall not be bound by any additional or different 279 | terms or conditions communicated by You unless expressly agreed. 280 | 281 | (b) Any arrangements, understandings, or agreements regarding the 282 | Licensed Material not stated herein are separate from and 283 | independent of the terms and conditions of this Public License. 284 | 285 | Section 8 – Interpretation. 286 | 287 | (a) For the avoidance of doubt, this Public License does not, and 288 | shall not be interpreted to, reduce, limit, restrict, or impose 289 | conditions on any use of the Licensed Material that could lawfully 290 | be made without permission under this Public License. 291 | 292 | (b) To the extent possible, if any provision of this Public License 293 | is deemed unenforceable, it shall be automatically reformed to the 294 | minimum extent necessary to make it enforceable. If the provision 295 | cannot be reformed, it shall be severed from this Public License 296 | without affecting the enforceability of the remaining terms and 297 | conditions. 298 | 299 | (c) No term or condition of this Public License will be waived and 300 | no failure to comply consented to unless expressly agreed to by the 301 | Licensor. 
302 | 303 | (d) Nothing in this Public License constitutes or may be interpreted 304 | as a limitation upon, or waiver of, any privileges and immunities 305 | that apply to the Licensor or You, including from the legal 306 | processes of any jurisdiction or authority. 307 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for ONF documentation created with Sphinx 2 | 3 | # use bash for pushd/popd, and to fail quickly. virtualenv's activate 4 | # has undefined variables, so no -u 5 | SHELL = bash -e -o pipefail 6 | 7 | # You can set these variables from the command line. 8 | SPHINXOPTS ?= 9 | SPHINXBUILD ?= sphinx-build 10 | SOURCEDIR ?= . 11 | BUILDDIR ?= _build 12 | 13 | # Create the virtualenv with all the tools installed 14 | VIRTUALENV = venv-docs 15 | 16 | # Put it first so that "make" without argument is like "make help". 17 | help: $(VIRTUALENV) 18 | source ./$(VIRTUALENV)/bin/activate ;\ 19 | $(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | 21 | .PHONY: help lint reload Makefile test 22 | 23 | # Create the virtualenv with all the tools installed 24 | $(VIRTUALENV): 25 | python3 -m venv $@ ;\ 26 | source ./$@/bin/activate ;\ 27 | pip install -r requirements.txt 28 | 29 | # lint and link verification. linkcheck is built into sphinx 30 | test: lint spelling 31 | 32 | # lint all .rst files 33 | lint: $(VIRTUALENV) 34 | source ./$(VIRTUALENV)/bin/activate ;\ 35 | doc8 --max-line-length 119 *.rst 36 | 37 | # NOTE: the rules below are a reconstruction (the original file is truncated here); 38 | # they follow the standard Sphinx Makefile pattern used by the help target above. 39 | spelling: $(VIRTUALENV) 40 | source ./$(VIRTUALENV)/bin/activate ;\ 41 | $(SPHINXBUILD) -b spelling "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 42 | 43 | # catch-all: route any other target (e.g., html, epub) to Sphinx 44 | %: Makefile $(VIRTUALENV) 45 | source ./$(VIRTUALENV)/bin/activate ;\ 46 | $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 47 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | About The Book 2 | ============== 3 | 4 | This repository contains source text for the book *TCP Congestion 5 | Control: A Systems Approach*, written by Larry Peterson, Lawrence 6 | Brakmo, and Bruce Davie. An online version is published at 7 | `https://tcpcc.systemsapproach.org 8 | <https://tcpcc.systemsapproach.org>`__, and instructions for 9 | building a web-viewable copy from this source are given below. 10 | 11 | License 12 | ------- 13 | 14 | The book is available under terms of the `CC BY-NC-ND 4.0 15 | <https://creativecommons.org/licenses/by-nc-nd/4.0>`__ license. The 16 | community is invited to contribute corrections, improvements, updates, 17 | and new material under the same terms. While this license does not 18 | automatically grant the right to make derivative works, we are keen to 19 | discuss derivative works (such as translations) with interested 20 | parties. Please reach out to 21 | `discuss@systemsapproach.org <mailto:discuss@systemsapproach.org>`__. 22 | 23 | If you make use of this work, the attribution should include the 24 | following information: 25 | 26 | | *Title: TCP Congestion Control: A Systems Approach* 27 | | *Authors: Larry Peterson, Lawrence Brakmo, and Bruce Davie* 28 | | *Source:* https://github.com/SystemsApproach/tcpcc 29 | | *License:* \ `CC BY-NC-ND 4.0 <https://creativecommons.org/licenses/by-nc-nd/4.0>`__ 30 | 31 | Read the Book 32 | ------------- 33 | 34 | This book is part of the `Systems Approach Series 35 | <https://www.systemsapproach.org>`__, with an online version 36 | published at `https://tcpcc.systemsapproach.org 37 | <https://tcpcc.systemsapproach.org>`__. 38 | 39 | To track progress and receive notices about new versions, you can follow 40 | the project on 41 | Mastodon. 42 | To read a running commentary on how the Internet is evolving, and for 43 | updates on our writing projects, you can sign up for the 44 | Systems Approach Newsletter. 45 | 46 | Build the Book 47 | -------------- 48 | 49 | To build a web-viewable version, you first need to download the source: 50 | 51 | .. code:: shell 52 | 53 | $ mkdir ~/systemsapproach 54 | $ cd ~/systemsapproach 55 | $ git clone https://github.com/SystemsApproach/tcpcc.git 56 | $ cd tcpcc 57 | 58 | The build process is stored in the Makefile and requires Python be 59 | installed. The Makefile will create a virtualenv (``venv-docs``) which 60 | installs the documentation generation toolset. You may also need to 61 | install the ``enchant`` C library using your system’s package manager 62 | for the spelling checker to function properly. 63 | 64 | To generate HTML in ``_build/html``, run ``make html``. 65 | 66 | To check the formatting of the book, run ``make lint``.
67 | 68 | To check spelling, run ``make spelling``. If there are additional 69 | words, names, or acronyms that are correctly spelled but not in the 70 | dictionary, please add them to the ``dict.txt`` file. 71 | 72 | To see the other available output formats, run ``make``. 73 | 74 | Contribute to the Book 75 | ---------------------- 76 | 77 | We hope that if you use this material, you are also willing to 78 | contribute back to it. If you are new to open source, you might check 79 | out this `How to Contribute to Open 80 | Source <https://opensource.guide/how-to-contribute/>`__ guide. Among 81 | other things, you’ll learn about posting *Issues* that you’d like to see 82 | addressed, and issuing *Pull Requests* to merge your improvements back 83 | into GitHub. 84 | 85 | If you’d like to contribute and are looking for something that needs 86 | attention, see the `wiki <https://github.com/SystemsApproach/tcpcc/wiki>`__ 87 | for the current TODO list. 88 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | Version 1.1-dev -------------------------------------------------------------------------------- /_extra/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: AI2Bot 2 | User-agent: Ai2Bot-Dolma 3 | User-agent: aiHitBot 4 | User-agent: Amazonbot 5 | User-agent: anthropic-ai 6 | User-agent: Applebot 7 | User-agent: Applebot-Extended 8 | User-agent: Brightbot 1.0 9 | User-agent: Bytespider 10 | User-agent: CCBot 11 | User-agent: ChatGPT-User 12 | User-agent: Claude-Web 13 | User-agent: ClaudeBot 14 | User-agent: cohere-ai 15 | User-agent: cohere-training-data-crawler 16 | User-agent: Cotoyogi 17 | User-agent: Crawlspace 18 | User-agent: Diffbot 19 | User-agent: DuckAssistBot 20 | User-agent: FacebookBot 21 | User-agent: Factset_spyderbot 22 | User-agent: FirecrawlAgent 23 | User-agent: FriendlyCrawler 24 | User-agent: Google-Extended 25 | User-agent: GoogleOther 26 | User-agent: GoogleOther-Image 27 | User-agent: GoogleOther-Video 28 | User-agent: GPTBot 29 | User-agent: iaskspider/2.0 30 | User-agent: ICC-Crawler 31 | User-agent: ImagesiftBot 32 | User-agent: img2dataset 33 | User-agent: imgproxy 34 | User-agent: ISSCyberRiskCrawler 35 | User-agent: Kangaroo Bot 36 | User-agent: meta-externalagent 37 | User-agent: Meta-ExternalAgent 38 | User-agent: meta-externalfetcher 39 | User-agent: Meta-ExternalFetcher 40 | User-agent: NovaAct 41 | User-agent: OAI-SearchBot 42 | User-agent: omgili 43 | User-agent: omgilibot 44 | User-agent: Operator 45 | User-agent: PanguBot 46 | User-agent: Perplexity-User 47 | User-agent: PerplexityBot 48 | User-agent: PetalBot 49 | User-agent: Scrapy 50 | User-agent: SemrushBot-OCOB 51 | User-agent: SemrushBot-SWA 52 | User-agent: Sidetrade indexer bot 53 | User-agent: TikTokSpider 54 | User-agent: Timpibot 55 | User-agent: VelenPublicWebCrawler 56 | User-agent: Webzio-Extended 57 | User-agent: YouBot 58 | Disallow: / 59 | -------------------------------------------------------------------------------- /_static/SystemsApproachLogoURL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/SystemsApproachLogoURL.png -------------------------------------------------------------------------------- /_static/bridge.ico: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/bridge.ico -------------------------------------------------------------------------------- /_static/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/cover.jpg -------------------------------------------------------------------------------- /_static/css/rtd_theme_mods.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019-present Open Networking Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. */ 15 | 16 | /* Don't restrict content width on the RTD theme 17 | * from: https://stackoverflow.com/a/32898444 */ 18 | 19 | .wy-nav-content { 20 | max-width: none; 21 | } 22 | 23 | .wy-table-responsive table td, .wy-table-responsive table th { 24 | white-space: normal; 25 | } 26 | 27 | /* Colors for navigation */ 28 | 29 | .wy-side-nav-search, .wy-nav-top { 30 | background: #2F5597; 31 | } 32 | 33 | /* .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#2F5597} */ 34 | 35 | .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#6AB0DE} 36 | 37 | /* Headings */ 38 | h1, h2 { 39 | font-weight: bold; 40 | line-height: 1.25; 41 | color: #3279a8; 42 | text-rendering: optimizeLegibility; 43 | } 44 | 45 | h3, h4, h5, h6 { 46 | margin-bottom: .5rem; 47 | font-style: italic; 48 | line-height: 1.25; 49 | color: #313131; 50 | text-rendering: optimizeLegibility; 51 | } 52 | 53 | h1 { 54 | margin-bottom: 2rem; 55 | font-size: 2rem; 56 | } 57 | 58 | h2 { 59 | margin-bottom: .5rem; 60 | margin-top: 1rem; 61 | font-size: 1.5rem; 62 | } 63 | 64 | h3 { 65 | margin-top: 1.5rem; 66 | font-size: 1.25rem; 67 | } 68 | 69 | .pop { 70 | color: #6AB0DE; 71 | font-style: italic; 72 | font-weight: bold; 73 | } 74 | aside.sidebar { 75 | margin: 0 0 0.5em 1em; 76 | border: 1px solid #ddb; 77 | padding: 7px 7px 0 7px; 78 | background-color: #ffe; 79 | width: 40%; 80 | float: right; 81 | } 82 | -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /aqm.rst: -------------------------------------------------------------------------------- 1 | Chapter 6: Active Queue Management 2 |
=================================== 3 | 4 | We now look at the role routers can play in congestion control, an 5 | approach often referred to as *Active Queue Management* (AQM). By 6 | its very nature, AQM introduces an element of avoidance to the 7 | end-to-end solution, even when paired with a control-based approach 8 | like TCP Reno. 9 | 10 | Changing router behavior has never been the Internet’s preferred way 11 | of introducing new features, but nonetheless, the approach has been a 12 | constant source of consternation over the last 30 years. The problem 13 | is that while it’s generally agreed that routers are in an ideal 14 | position to detect the onset of congestion—it's their queues that 15 | start to fill up—there has not been a consensus on exactly what the 16 | best algorithm is. The following describes two of the classic 17 | mechanisms, and concludes with a brief discussion of where things 18 | stand today. 19 | 20 | 6.1 DECbit 21 | ---------- 22 | 23 | The first mechanism was developed for use on the Digital Network 24 | Architecture (DNA), an early peer of the TCP/IP Internet that also 25 | adopted a connectionless/best-effort network model. A description 26 | of the approach, published by K.K. Ramakrishnan and Raj Jain, was 27 | presented at the same SIGCOMM as the Jacobson/Karels paper in 1988. 28 | 29 | .. _reading_decbit: 30 | .. admonition:: Further Reading 31 | 32 | K.K. Ramakrishnan and R. Jain. 33 | A Binary Feedback Scheme for 34 | Congestion Avoidance in Computer Networks with a Connectionless 35 | Network Layer. 36 | ACM SIGCOMM, August 1988. 37 | 38 | The idea is to more evenly split the responsibility for congestion 39 | control between the routers and the end hosts. Each router monitors 40 | the load it is experiencing and explicitly notifies the end nodes when 41 | congestion is about to occur. This notification is implemented by 42 | setting a binary congestion bit in the packets that flow through the 43 | router, which came to be known as the *DECbit*. The destination host 44 | then copies this congestion bit into the ACK it sends back to the 45 | source. Finally, the source adjusts its sending rate so as to avoid 46 | congestion. The following discussion describes the algorithm in more 47 | detail, starting with what happens in the router. 48 | 49 | A single congestion bit is added to the packet header. A router sets 50 | this bit in a packet if its average queue length is greater than or 51 | equal to 1 at the time the packet arrives. This average queue length 52 | is measured over a time interval that spans the last busy+idle cycle, 53 | plus the current busy cycle. (The router is *busy* when it is 54 | transmitting and *idle* when it is not.) :numref:`Figure %s 55 | <fig-decbit>` shows the queue length at a router as a function of 56 | time. Essentially, the router calculates the area under the curve and 57 | divides this value by the time interval to compute the average queue 58 | length. Using a queue length of 1 as the trigger for setting the 59 | congestion bit is a trade-off between significant queuing (and hence 60 | higher throughput) and increased idle time (and hence lower delay). In 61 | other words, a queue length of 1 seems to optimize the power function. 62 | 63 | .. _fig-decbit: 64 | .. figure:: figures/f06-14-9780123850591.png 65 | :width: 500px 66 | :align: center 67 | 68 | Computing average queue length at a router.
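To make this concrete, here is a minimal sketch of how a router might
maintain the average. It is illustrative rather than a description of
any production implementation; the function names, the event-driven
structure, and the cycle bookkeeping are all assumptions made for the
example.

.. code:: c

   /* Sketch of the DECbit average queue length computation. The
    * router integrates queue length over time (the "area under the
    * curve"), spanning the previous busy+idle cycle plus the current
    * cycle, then divides by the elapsed time. */

   static double prev_area, prev_time; /* last busy+idle cycle */
   static double cur_area, cur_time;   /* current cycle so far */
   static int qlen;                    /* instantaneous queue length */

   /* Called whenever the queue length changes; dt is the time spent
    * at the old length. */
   void queue_changed(double dt, int new_qlen)
   {
       cur_area += qlen * dt;
       cur_time += dt;
       qlen = new_qlen;
   }

   /* Called when a busy+idle cycle completes. */
   void cycle_boundary(void)
   {
       prev_area = cur_area;
       prev_time = cur_time;
       cur_area = cur_time = 0;
   }

   /* Called on each packet arrival: set the congestion bit if the
    * average queue length over the interval is at least 1. (Assumes
    * some time has elapsed, so the divisor is nonzero.) */
   int decbit_should_set(void)
   {
       double avg = (prev_area + cur_area) / (prev_time + cur_time);
       return avg >= 1.0;
   }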
69 | 70 | Now turning our attention to the host half of the mechanism, the source 71 | records how many of its packets resulted in some router setting the 72 | congestion bit. In particular, the source maintains a congestion window, 73 | just as in TCP, and watches to see what fraction of the last window’s 74 | worth of packets resulted in the bit being set. If less than 50% of the 75 | packets had the bit set, then the source increases its congestion window 76 | by one packet. If 50% or more of the last window’s worth of packets had 77 | the congestion bit set, then the source decreases its congestion window 78 | to 0.875 times the previous value. The value 50% was chosen as the 79 | threshold based on analysis that showed it to correspond to the peak of 80 | the power curve. The “increase by 1, decrease by 0.875” rule was 81 | selected because additive increase/multiplicative decrease makes the 82 | mechanism stable. 83 | 84 | 6.2 Random Early Detection 85 | -------------------------- 86 | 87 | A second mechanism, called *random early detection* (RED), is similar to 88 | the DECbit scheme in that each router is programmed to monitor its own 89 | queue length and, when it detects that congestion is imminent, to notify 90 | the source to adjust its congestion window. RED, invented by Sally Floyd 91 | and Van Jacobson in the early 1990s, differs from the DECbit scheme in 92 | two major ways. 93 | 94 | .. _reading_red: 95 | .. admonition:: Further Reading 96 | 97 | S. Floyd and V. Jacobson. Random Early Detection (RED) 98 | Gateways for Congestion Avoidance. 99 | IEEE/ACM Transactions on Networking, August 1993. 100 | 101 | The first is that rather than explicitly sending a congestion 102 | notification message to the source, RED is most commonly implemented 103 | such that it *implicitly* notifies the source of congestion by dropping 104 | one of its packets. The source is, therefore, effectively notified by 105 | the subsequent timeout or duplicate ACK. In case you haven’t already 106 | guessed, RED is designed to be used in conjunction with TCP, which 107 | currently detects congestion by means of timeouts (or some other means 108 | of detecting packet loss such as duplicate ACKs). As the “early” part of 109 | the RED acronym suggests, the gateway drops the packet earlier than it 110 | would have to, so as to notify the source that it should decrease its 111 | congestion window sooner than it would normally have. In other words, 112 | the router drops a few packets before it has exhausted its buffer space 113 | completely, so as to cause the source to slow down, with the hope that 114 | this will mean it does not have to drop lots of packets later on. 115 | 116 | The second difference between RED and DECbit is in the details of how 117 | RED decides when to drop a packet and what packet it decides to drop. To 118 | understand the basic idea, consider a simple FIFO queue. Rather than 119 | wait for the queue to become completely full and then be forced to drop 120 | each arriving packet (the tail drop policy described in Section 2.1.3), we 121 | could decide to drop each arriving packet with some *drop probability* 122 | whenever the queue length exceeds some *drop level*. This idea is called 123 | *early random drop*. The RED algorithm defines the details of how to 124 | monitor the queue length and when to drop a packet. 125 | 126 | In the following paragraphs, we describe the RED algorithm as originally 127 | proposed by Floyd and Jacobson.
We note that several modifications have 128 | since been proposed both by the inventors and by other researchers. 129 | However, the key ideas are the same as those presented below, and most 130 | current implementations are close to the algorithm that follows. 131 | 132 | First, RED computes an average queue length using a weighted running 133 | average similar to the one used in the original TCP timeout computation. 134 | That is, ``AvgLen`` is computed as 135 | 136 | .. math:: \mathsf{AvgLen = (1 - Weight)\ x\ AvgLen + Weight\ x\ SampleLen} 137 | 138 | where 0 < ``Weight`` < 1 and ``SampleLen`` is the length of the queue 139 | when a sample measurement is made. In most software implementations, the 140 | queue length is measured every time a new packet arrives at the gateway. 141 | In hardware, it might be calculated at some fixed sampling interval. 142 | 143 | The reason for using an average queue length rather than an 144 | instantaneous one is that it more accurately captures the notion of 145 | congestion. Because of the bursty nature of Internet traffic, queues 146 | can become full very quickly and then become empty again. If a queue 147 | is spending most of its time empty, then it’s probably not appropriate 148 | to conclude that the router is congested and to tell the hosts to slow 149 | down. Thus, the weighted running average calculation tries to detect 150 | long-lived congestion, as indicated in the right-hand portion of 151 | :numref:`Figure %s <fig-red-avg>`, by filtering out short-term changes 152 | in the queue length. You can think of the running average as a 153 | low-pass filter, where ``Weight`` determines the time constant of the 154 | filter. The question of how we pick this time constant is discussed 155 | below. 156 | 157 | .. _fig-red-avg: 158 | .. figure:: figures/f06-15-9780123850591.png 159 | :width: 500px 160 | :align: center 161 | 162 | Weighted running average queue length. 163 | 164 | Second, RED has two queue length thresholds that trigger certain 165 | activity: ``MinThreshold`` and ``MaxThreshold``. When a packet arrives 166 | at the gateway, RED compares the current ``AvgLen`` with these two 167 | thresholds, according to the following rules: 168 | 169 | .. literalinclude:: code/red.c 170 | 171 | If the average queue length is smaller than the lower threshold, no 172 | action is taken, and if the average queue length is larger than the 173 | upper threshold, then the packet is always dropped. If the average 174 | queue length is between the two thresholds, then the newly arriving 175 | packet is dropped with some probability ``P``. This situation is 176 | depicted in :numref:`Figure %s <fig-red>`. The approximate 177 | relationship between ``P`` and ``AvgLen`` is shown in :numref:`Figure 178 | %s <fig-red-prob>`. Note that the probability of drop increases slowly 179 | when ``AvgLen`` is between the two thresholds, reaching ``MaxP`` at 180 | the upper threshold, at which point it jumps to unity. The rationale 181 | behind this is that, if ``AvgLen`` reaches the upper threshold, then 182 | the gentle approach (dropping a few packets) is not working and 183 | drastic measures are called for: dropping all arriving packets. Some 184 | research has suggested that a smoother transition from random dropping 185 | to complete dropping, rather than the discontinuous approach shown 186 | here, may be appropriate. 187 | 188 | .. _fig-red: 189 | .. figure:: figures/f06-16-9780123850591.png 190 | :width: 300px 191 | :align: center 192 | 193 | RED thresholds on a FIFO queue. 194 | 195 | ..
_fig-red-prob: 196 | .. figure:: figures/f06-17-9780123850591.png 197 | :width: 400px 198 | :align: center 199 | 200 | Drop probability function for RED. 201 | 202 | Although :numref:`Figure %s <fig-red-prob>` shows the probability of 203 | drop as a function only of ``AvgLen``, the situation is actually a 204 | little more complicated. In fact, ``P`` is a function of both 205 | ``AvgLen`` and how long it has been since the last packet was 206 | dropped. Specifically, it is computed as follows: 207 | 208 | .. math:: \mathsf{TempP = MaxP\ x\ (AvgLen - MinThreshold)\ /\ (MaxThreshold - MinThreshold)} 209 | 210 | .. math:: \mathsf{P = TempP\ /\ (1 - count\ x\ TempP)} 211 | 212 | ``TempP`` is the variable that is plotted on the y-axis in :numref:`Figure 213 | %s <fig-red-prob>`, and ``count`` keeps track of how many newly arriving 214 | packets have been queued (not dropped) while ``AvgLen`` has been between 215 | the two thresholds. ``P`` increases slowly as ``count`` increases, 216 | thereby making a drop increasingly likely as the time since the last 217 | drop increases. This makes closely spaced drops relatively less likely 218 | than widely spaced drops. This extra step in calculating ``P`` was 219 | introduced by the inventors of RED when they observed that, without it, 220 | the packet drops were not well distributed in time but instead tended to 221 | occur in clusters. Because packet arrivals from a certain connection are 222 | likely to arrive in bursts, this clustering of drops is likely to cause 223 | multiple drops in a single connection. This is not desirable, since only 224 | one drop per round-trip time is enough to cause a connection to reduce 225 | its window size, whereas multiple drops might send it back into slow 226 | start. 227 | 228 | As an example, suppose that we set ``MaxP`` to 0.02 and ``count`` is 229 | initialized to zero. If the average queue length were halfway between 230 | the two thresholds, then ``TempP``, and the initial value of ``P``, 231 | would be half of ``MaxP``, or 0.01. An arriving packet, of course, has a 232 | 99 in 100 chance of getting into the queue at this point. With each 233 | successive packet that is not dropped, ``P`` slowly increases, and by 234 | the time 50 packets have arrived without a drop, ``P`` would have 235 | doubled to 0.02. In the unlikely event that 99 packets arrived without 236 | loss, ``P`` reaches 1, guaranteeing that the next packet is dropped. The 237 | important thing about this part of the algorithm is that it ensures a 238 | roughly even distribution of drops over time. 239 | 240 | The intent is that, if RED drops a small percentage of packets when 241 | ``AvgLen`` exceeds ``MinThreshold``, this will cause a few TCP 242 | connections to reduce their window sizes, which in turn will reduce the 243 | rate at which packets arrive at the router. All going well, ``AvgLen`` 244 | will then decrease and congestion is avoided. The queue length can be 245 | kept short, while throughput remains high since few packets are dropped. 246 | 247 | Note that, because RED is operating on a queue length averaged over 248 | time, it is possible for the instantaneous queue length to be much 249 | longer than ``AvgLen``. In this case, if a packet arrives and there is 250 | nowhere to put it, then it will have to be dropped. When this happens, 251 | RED is operating in tail drop mode. One of the goals of RED is to 252 | prevent tail drop behavior if possible.
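Pulling these pieces together, the decision RED makes on each arriving
packet can be summarized in a few lines of code. The sketch below is
illustrative only (the parameter values, variable names, and the use of
``rand()`` are assumptions made for the example), and it glosses over
details found in real implementations, such as measuring the queue in
bytes rather than packets.

.. code:: c

   #include <stdlib.h>

   /* Illustrative parameter settings; deployments tune these. */
   static double Weight = 0.002;
   static double MinThreshold = 5.0;  /* in packets */
   static double MaxThreshold = 10.0; /* in packets */
   static double MaxP = 0.02;

   static double AvgLen = 0.0;
   static int count = 0; /* packets queued (not dropped) while AvgLen
                            has been between the two thresholds */

   /* Called for each arriving packet, with the sampled queue length;
    * returns 1 if the packet should be dropped, 0 if queued. */
   int red_arrival(int SampleLen)
   {
       AvgLen = (1 - Weight) * AvgLen + Weight * SampleLen;

       if (AvgLen < MinThreshold) {
           count = 0;
           return 0; /* queue the packet */
       }
       if (AvgLen >= MaxThreshold) {
           count = 0;
           return 1; /* always drop */
       }

       /* Between the thresholds: drop with probability P. */
       double TempP = MaxP * (AvgLen - MinThreshold)
                           / (MaxThreshold - MinThreshold);
       double P = TempP / (1 - count * TempP);
       if ((double)rand() / RAND_MAX < P) {
           count = 0;
           return 1; /* random early drop */
       }
       count++;
       return 0;
   }

The random nature of RED confers an interesting property on the
algorithm.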
Because RED drops packets randomly, the probability that RED 256 | decides to drop a particular flow’s packet(s) is roughly proportional to 257 | the share of the bandwidth that flow is currently getting at that 258 | router. This is because a flow that is sending a relatively large number 259 | of packets is providing more candidates for random dropping. Thus, there 260 | is some sense of fair resource allocation built into RED, although it is 261 | by no means precise. While arguably fair, because RED punishes 262 | high-bandwidth flows more than low-bandwidth flows, it increases the 263 | probability of a TCP restart, which is doubly painful for those 264 | high-bandwidth flows. 265 | 266 | 267 | A fair amount of analysis has gone into setting the various RED 268 | parameters—for example, ``MaxThreshold``, ``MinThreshold``, ``MaxP`` 269 | and ``Weight``—all in the name of optimizing the power function 270 | (throughput-to-delay ratio). The performance of these parameters has 271 | also been confirmed through simulation, and the algorithm has been 272 | shown not to be overly sensitive to them. It is important to keep in 273 | mind, however, that all of this analysis and simulation hinges on a 274 | particular characterization of the network workload. The real 275 | contribution of RED is a mechanism by which the router can more 276 | accurately manage its queue length. Defining precisely what 277 | constitutes an optimal queue length depends on the traffic mix and is 278 | a subject of ongoing study. 279 | 280 | Consider the setting of the two thresholds, ``MinThreshold`` and 281 | ``MaxThreshold``. If the traffic is fairly bursty, then ``MinThreshold`` 282 | should be sufficiently large to allow the link utilization to be 283 | maintained at an acceptably high level. Also, the difference between the 284 | two thresholds should be larger than the typical increase in the 285 | calculated average queue length in one RTT. Setting ``MaxThreshold`` to 286 | twice ``MinThreshold`` seems to be a reasonable rule of thumb given the 287 | traffic mix on today’s Internet. In addition, since we expect the 288 | average queue length to hover between the two thresholds during periods 289 | of high load, there should be enough free buffer space *above* 290 | ``MaxThreshold`` to absorb the natural bursts that occur in Internet 291 | traffic without forcing the router to enter tail drop mode. 292 | 293 | We noted above that ``Weight`` determines the time constant for the 294 | running average low-pass filter, and this gives us a clue as to how we 295 | might pick a suitable value for it. Recall that RED is trying to send 296 | signals to TCP flows by dropping packets during times of congestion. 297 | Suppose that a router drops a packet from some TCP connection and then 298 | immediately forwards some more packets from the same connection. When 299 | those packets arrive at the receiver, it starts sending duplicate ACKs 300 | to the sender. When the sender sees enough duplicate ACKs, it will 301 | reduce its window size. So, from the time the router drops a packet 302 | until the time when the same router starts to see some relief from the 303 | affected connection in terms of a reduced window size, at least one 304 | round-trip time must elapse for that connection. There is probably not 305 | much point in having the router respond to congestion on time scales 306 | much less than the round-trip time of the connections passing through 307 | it. 
As noted previously, 100 ms is not a bad estimate of average 308 | round-trip times in the Internet. Thus, ``Weight`` should be chosen such 309 | that changes in queue length over time scales much less than 100 ms are 310 | filtered out. 311 | 312 | Since RED works by sending signals to TCP flows to tell them to slow 313 | down, you might wonder what would happen if those signals are ignored. 314 | This is often called the *unresponsive flow* problem. Unresponsive 315 | flows use more than their fair share of network resources and could 316 | cause congestive collapse if there were enough of them, just as in the 317 | days before TCP congestion control. Some queueing techniques, such as 318 | weighted fair queueing, could help with this problem by isolating 319 | certain classes of traffic from others. There was also discussion of 320 | creating a variant of RED that could drop more heavily from flows that 321 | are unresponsive to the initial hints that it sends. However, this 322 | turns out to be challenging because it can be hard to distinguish 323 | between non-responsive behavior and \"correct\" behavior, especially 324 | when flows have a wide variety of different RTTs and bottleneck bandwidths. 325 | 326 | As a footnote, 15 prominent network researchers urged the 327 | widespread adoption of RED-inspired AQM in 1998. The recommendation 328 | was largely ignored, for reasons that we touch on below. AQM 329 | approaches based on RED have, however, been applied with some success 330 | in datacenters. 331 | 332 | .. _reading_rfc: 333 | .. admonition:: Further Reading 334 | 335 | R. Braden, *et al*. 336 | `Recommendations on Queue Management and Congestion Avoidance in the Internet 337 | <https://datatracker.ietf.org/doc/html/rfc2309>`__. 338 | RFC 2309, April 1998. 339 | 340 | 341 | 6.3 Controlled Delay 342 | -------------------- 343 | 344 | As noted in the preceding section, RED has never been widely 345 | adopted. Certainly it never reached the level necessary to have a 346 | significant impact on congestion in the Internet. One reason 347 | is that RED is difficult to configure in a 348 | way that consistently improves performance. Note the large number 349 | of parameters that affect its operation (``MinThreshold``, 350 | ``MaxThreshold``, ``MaxP``, and ``Weight``). There is enough research 351 | showing that RED produces a wide range of outcomes (not all of 352 | them helpful) depending on the type of traffic and parameter settings. 353 | This created uncertainty around the merits of deploying it. 354 | 355 | Over a period of years, Van Jacobson (well known for his work on TCP 356 | congestion control and a co-author of the original RED paper) collaborated 357 | with Kathy Nichols and eventually other researchers to come up with an 358 | AQM approach that improves upon RED. This work became known as CoDel 359 | (pronounced *coddle*) for Controlled Delay AQM. CoDel builds on several 360 | key insights that emerged over decades of experience with TCP and 361 | AQM. 362 | 363 | .. _reading_codel: 364 | .. admonition:: Further Reading 365 | 366 | K. Nichols and V. Jacobson. 367 | Controlling Queue Delay. 368 | ACM Queue, 10(5), May 2012. 369 | 370 | First, queues are an important aspect of networking and it is expected 371 | that queues will build up from time to time. For example, a newly opened 372 | connection may dump a window's worth of packets into the network, and 373 | these are likely to form a queue at the bottleneck link. This is not 374 | in itself a problem.
There should be enough buffer capacity to 376 | absorb such bursts. Problems arise when there is not enough buffer 377 | capacity to absorb bursts, leading to excessive loss. This came to be 378 | understood in the 1990s as a requirement that buffers be able to hold 379 | at least one bandwidth-delay product of packets—a requirement that 380 | was probably too large and subsequently questioned by further 381 | research. But the fact is that buffers are necessary, and it is 382 | expected that they will be used to absorb bursts. The CoDel authors 383 | refer to this as \"good queue\", as illustrated in :numref:`Figure 384 | %s <fig-good-bad>` (a). 385 | 386 | .. _fig-good-bad: 387 | .. figure:: figures/Slide14.png 388 | :width: 400px 389 | :align: center 390 | 391 | Good and Bad Queue Scenarios 392 | 393 | Queues become a problem when they are persistently full. A 394 | persistently full queue is doing nothing except adding delay to the 395 | network, and it is also less able to absorb bursts if it never drains 396 | fully. The combination of large buffers and persistent queues within 397 | those buffers is a phenomenon that Jim Gettys has named 398 | *Bufferbloat*. It is clear that persistently full queues are what a 399 | well-designed AQM mechanism would seek to avoid. Queues that stay full 400 | for long periods without draining are referred to, unsurprisingly, as 401 | \"bad queue\", as shown in :numref:`Figure %s <fig-good-bad>` (b). 402 | 403 | .. _reading_bloat: 404 | .. admonition:: Further Reading 405 | 406 | J. Gettys. Bufferbloat: Dark Buffers in the Internet. IEEE 408 | Internet Computing, April 2011. 409 | 410 | In a sense, then, the challenge for an AQM algorithm is to distinguish 411 | between \"good\" and \"bad\" queues, and to trigger packet loss only when 412 | the queue is determined to be \"bad\". Indeed, this is what RED is 413 | trying to do with its ``Weight`` parameter (which filters out 414 | transient queue length). 415 | 416 | One of the innovations of CoDel is to focus on *sojourn time*: the 417 | time that any given packet waits in the queue. Sojourn time is 418 | independent of the bandwidth of a link and provides a useful indication 419 | of congestion even on links whose bandwidth varies over time, such as 420 | wireless links. A queue that is behaving well will frequently drain to 421 | zero, and thus, some packets will experience a sojourn time close to 422 | zero, as in :numref:`Figure %s <fig-good-bad>` (a). Conversely, a 423 | congested queue will delay every packet, and the minimum sojourn time 424 | will never be close to zero, as seen in :numref:`Figure %s 425 | <fig-good-bad>` (b). CoDel therefore measures the sojourn 426 | time—something that is easy to do for every packet—and tracks whether 427 | it is consistently sitting above some small target, as sketched 428 | below. \"Consistently\" is defined as \"lasting longer than a typical RTT\".
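The core of that tracking logic is small enough to sketch directly. The
following is a simplified illustration, not the actual CoDel
implementation: it assumes two constants, ``target`` and ``interval``
(whose default values are discussed next), and it omits the control
law, described below, that spaces out successive drops.

.. code:: c

   #include <stdbool.h>

   static double target = 0.005;   /* seconds */
   static double interval = 0.100; /* seconds, roughly one RTT */

   /* Deadline by which the sojourn time must come back below target;
    * zero means the queue is not currently above target. */
   static double first_above = 0;

   /* Called as each packet is dequeued; sojourn is the time this
    * packet spent in the queue, now is the current time. Returns
    * true once the queue has stayed above target for an interval. */
   bool codel_above_target(double sojourn, double now)
   {
       if (sojourn < target) {
           first_above = 0; /* queue drained: "good queue" */
           return false;
       }
       if (first_above == 0) {
           first_above = now + interval;
           return false;
       }
       return now >= first_above; /* "bad queue": time to act */
   }

Rather than asking operators to determine the parameters to make
CoDel work well, the algorithm chooses reasonable defaults. A target
sojourn time of 5ms is used, along with a sliding measurement window
of 100ms. The intuition, as with RED, is that 100ms is a typical RTT
for traffic traversing the Internet, and that if congestion is lasting
longer than 100ms, we may be moving into the \"bad queue\" region. So
CoDel monitors the sojourn time relative to the target of 5ms.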
If it 437 | is above target for more than 100ms, it is time to start taking action 438 | to reduce the queue via drops (or marking if explicit congestion 439 | notification, described below, is available). 5ms is chosen as being 440 | close to zero (for better delay) but not so small that the queue would 441 | run empty. It should be noted that a great deal of experimentation and 442 | simulation has gone into these numerical choices, but more importantly, the 443 | algorithm does not seem to be overly sensitive to them. 444 | 445 | To summarize, CoDel largely ignores queues that last less than an RTT, 446 | but starts taking action as soon as a queue persists for more than 447 | an RTT. By making reasonable assumptions about Internet RTTs, the algorithm 448 | requires no configuration parameters. 449 | 450 | An additional subtlety is that CoDel drops a slowly increasing percentage of 451 | traffic as long as the observed sojourn time remains above the target. As 452 | discussed further in Section 7.4, TCP throughput has been shown to 453 | depend inversely on the square root of loss rate. Thus, as long as the 454 | sojourn time stays above the target, CoDel steadily 455 | increases its drop rate in proportion to the square root 456 | of the number of drops since the target was exceeded. The effect of 457 | this, in theory, is to cause a linear decrease in throughput of the 458 | affected TCP connections. Eventually this should lead to enough 459 | reduction in arriving traffic to allow the queue to drain, bringing 460 | the sojourn time back below the target. 461 | 462 | .. _fig-codel: 463 | .. figure:: figures/Slide16.png 464 | :width: 500px 465 | :align: center 466 | 467 | Home routers can suffer from bufferbloat, a situation CoDel is 468 | well-suited to address. 469 | 470 | There are more details to CoDel presented in the Nichols and Jacobson 471 | paper, including extensive simulations to indicate its effectiveness 472 | across a wide range of scenarios. The algorithm has been standardized 473 | as \"experimental\" by the IETF in RFC 8289. It is also implemented in 474 | the Linux kernel, which has aided in its deployment. In particular, 475 | CoDel provides value in home routers (which are often Linux-based), a 476 | point along the end-to-end path (see :numref:`Figure %s <fig-codel>`) 477 | that commonly experiences bufferbloat. 478 | 479 | 480 | 6.4 Explicit Congestion Notification 481 | ------------------------------------ 482 | 483 | While TCP's congestion control mechanism was initially based on packet 484 | loss as the primary congestion signal, it has long been recognized 485 | that TCP could do a better job if routers were to send a more explicit 486 | congestion signal. That is, instead of *dropping* a packet and assuming TCP will eventually 487 | notice (e.g., due to the arrival of a duplicate ACK), any AQM 488 | algorithm can potentially do a better job if it instead *marks* the 489 | packet and continues to send it along its way to the destination. This 490 | idea was codified in changes to the IP and TCP headers known as 491 | *Explicit Congestion Notification* (ECN), as specified in RFC 3168. 492 | 493 | .. _reading_ecn: 494 | .. admonition:: Further Reading 495 | 496 | K. Ramakrishnan, S. Floyd, and D. Black. 497 | `The Addition of Explicit Congestion Notification (ECN) to IP 498 | <https://datatracker.ietf.org/doc/html/rfc3168>`__. 499 | RFC 3168, September 2001. 500 | 501 | Specifically, this feedback is implemented by treating two bits in the 502 | IP ``TOS`` field as ECN bits. One bit is set by the source to indicate 503 | that it is ECN-capable, that is, able to react to a congestion 504 | notification. This is called the ``ECT`` bit (ECN-Capable Transport). 505 | The other bit is set by routers along the end-to-end path when 506 | congestion is encountered, as computed by whatever AQM algorithm it is 507 | running. This is called the ``CE`` bit (Congestion Encountered).
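To make the header manipulation concrete, the sketch below shows how a
router's AQM code might test and set these bits in the ``TOS`` byte.
The function is invented for this example; only the codepoint layout is
standard, and note that RFC 3168 actually interprets the two bits
together as a codepoint rather than as independent flags, which is the
interpretation the sketch follows.

.. code:: c

   #include <stdint.h>

   #define ECN_MASK    0x3 /* low two bits of the IP TOS byte */
   #define ECN_NOT_ECT 0x0 /* transport is not ECN-capable */
   #define ECN_ECT1    0x1 /* ECN-capable transport (1) */
   #define ECN_ECT0    0x2 /* ECN-capable transport (0) */
   #define ECN_CE      0x3 /* congestion encountered */

   /* Called when the AQM algorithm decides to signal congestion.
    * Returns 1 if the packet was marked; 0 if it must be dropped
    * instead, because the source is not ECN-capable. */
   int ecn_mark(uint8_t *tos)
   {
       if ((*tos & ECN_MASK) == ECN_NOT_ECT)
           return 0; /* fall back to dropping the packet */
       *tos = (*tos & ~ECN_MASK) | ECN_CE; /* mark rather than drop */
       return 1;
   }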
500 | 
501 | Specifically, this feedback is implemented by treating two bits in the
502 | IP ``TOS`` field as ECN bits. One bit is set by the source to indicate
503 | that it is ECN-capable, that is, able to react to a congestion
504 | notification. This is called the ``ECT`` bit (ECN-Capable Transport).
505 | The other bit is set by routers along the end-to-end path when
506 | congestion is encountered, as computed by whatever AQM algorithm they are
507 | running. This is called the ``CE`` bit (Congestion Encountered).
508 | 
509 | In addition to these two bits in the IP header (which are
510 | transport-agnostic), ECN also includes the addition of two optional
511 | flags to the TCP header. The first, ``ECE`` (ECN-Echo), communicates
512 | from the receiver to the sender that it has received a packet with the
513 | ``CE`` bit set. The second, ``CWR`` (Congestion Window Reduced),
514 | communicates from the sender to the receiver that it has reduced the
515 | congestion window.
516 | 
517 | While ECN is now the standard interpretation of two of the eight bits in
518 | the ``TOS`` field of the IP header, and support for ECN is highly
519 | recommended, it is not required. Moreover, there is no single
520 | recommended AQM algorithm; instead, there is a list of requirements
521 | that a good AQM algorithm should meet. Like TCP congestion control
522 | algorithms, every AQM algorithm has its advantages and disadvantages,
523 | and so we need a lot of them to argue about.
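
In practice, RFC 3168 treats the two bits together as four codepoints: two
of them (``ECT(0)`` and ``ECT(1)``) say "ECN-capable", one says "not
ECN-capable", and one is the congestion mark. The router-side decision is
then simple enough to sketch; the function below is our own illustrative
code (not from any particular router), showing how an AQM algorithm's
"signal congestion" decision translates into either a ``CE`` mark or a
conventional drop:

.. code-block:: c

   #include <stdint.h>

   /* The two ECN bits in the IP TOS/Traffic Class byte (RFC 3168). */
   #define ECN_MASK    0x03
   #define ECN_NOT_ECT 0x00  /* sender is not ECN-capable     */
   #define ECN_ECT1    0x01  /* ECN-capable transport, ECT(1) */
   #define ECN_ECT0    0x02  /* ECN-capable transport, ECT(0) */
   #define ECN_CE      0x03  /* congestion experienced        */

   /* Called when the AQM algorithm (RED, CoDel, ...) has decided this
    * packet should signal congestion: mark it if the flow is
    * ECN-capable, otherwise fall back to dropping it. Returns 1 if
    * the packet survives and should be forwarded. */
   int signal_congestion(uint8_t *tos)
   {
       uint8_t ecn = *tos & ECN_MASK;

       if (ecn == ECN_NOT_ECT)
           return 0;                        /* not ECN-capable: drop */

       *tos = (*tos & ~ECN_MASK) | ECN_CE;  /* set CE and forward    */
       return 1;
   }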
524 | 
525 | 
526 | 6.5 Ingress/Egress Queues
527 | -------------------------
528 | 
529 | We have been drawing a clear line between approaches to congestion
530 | control that happen *inside the network* (i.e., the AQM algorithms
531 | described in this chapter) and *at the edge of the network* (i.e., the
532 | TCP-based algorithms described in earlier chapters). But the line
533 | isn’t necessarily that crisp. To see this, you just have to think of
534 | the end-to-end path as having an *ingress queue* at the kernel/device
535 | interface on the sending host and an *egress queue* at the
536 | device/kernel interface on the receiving host.\ [#]_ These edge queues
537 | are likely to become increasingly important as virtual switches and
538 | NIC support for virtualization become more and more common.
539 | 
540 | .. [#]
541 |    Confusingly, the *ingress queue* from the perspective of the
542 |    network path is the outbound (egress) queue on the sending host,
543 |    and the *egress queue* from the perspective of the network
544 |    path is the inbound (ingress) queue on the receiving host. As
545 |    shown in :numref:`Figure %s <fig-ingress_egress>`, we use the
546 |    terms ingress and egress from the network's perspective.
547 | 
548 | This perspective is illustrated in :numref:`Figure %s
549 | <fig-ingress_egress>`, where both locations sit below TCP and provide
550 | an opportunity to inject a second piece of congestion control logic
551 | into the end-to-end path. CoDel and ECN are examples of this idea: they
552 | have been implemented at the device queue level of the Linux kernel.
553 | 
554 | .. _fig-ingress_egress:
555 | .. figure:: figures/Slide15.png
556 |    :width: 500px
557 |    :align: center
558 | 
559 |    Ingress and egress queues along the end-to-end path, implemented in
560 |    the sending and receiving hosts, respectively.
561 | 
562 | Does this work? One issue is whether packets are dropped at the ingress
563 | or the egress. When a packet is dropped at the ingress (on the sending host),
564 | TCP is notified in the return value of the *Write* function, which
565 | causes it to "forget" that it sent the packet. This means the packet
566 | will simply be sent again, although TCP does decrease its congestion window in
567 | response to the failed write. In contrast, when packets are dropped at the
568 | egress queue (on the receiving host), the TCP sender will not
569 | know to retransmit the packet until it detects the loss using one of
570 | its standard mechanisms (e.g., three duplicate ACKs or a timeout). Of
571 | course, having the egress implement ECN helps.
572 | 
573 | When we consider this discussion in the context of the bigger
574 | congestion control picture, we can make
575 | two interesting observations. One is that Linux provides a convenient
576 | and safe way to inject new code—including congestion control
577 | logic—into the kernel, namely, the *extended Berkeley Packet
578 | Filter (eBPF)*. eBPF is becoming an important technology in many other
579 | contexts as well. The standard kernel API for congestion control has
580 | been ported to eBPF, and most existing congestion control algorithms
581 | have been ported to this framework. This simplifies the task of
582 | experimenting with new algorithms or tweaking existing ones by
583 | side-stepping the hurdle of waiting for the relevant Linux kernel to
584 | be deployed.
585 | 
586 | .. _reading_bpf:
587 | .. admonition:: Further Reading
588 | 
589 |    The Linux Kernel.
590 |    `BPF Documentation
591 |    `__.
592 | 
593 | A second observation is that by explicitly exposing the ingress/egress
594 | queues to the decision-making process, we open the door to building a
595 | congestion control mechanism that contains both a “decide when to
596 | transmit a packet” component and a “decide to queue-or-drop a packet”
597 | component. We’ll see an example of a mechanism that takes an innovative
598 | approach to using these two components in Section 7.1 when we describe
599 | On-Ramp.
600 | 
601 | 
602 | 
603 | 
--------------------------------------------------------------------------------
/authors.rst:
--------------------------------------------------------------------------------
1 | About The Authors
2 | ==================
3 | 
4 | **Larry Peterson** is the Robert E. Kahn Professor of Computer
5 | Science, Emeritus at Princeton University, where he served as Chair
6 | from 2003-2009. His research focuses on the design, implementation,
7 | and operation of Internet-scale distributed systems, including the
8 | widely used PlanetLab and MeasurementLab platforms. He is currently
9 | contributing to the Aether access-edge cloud project at the Open
10 | Networking Foundation (ONF), where he serves as Chief Scientist.
11 | Peterson is a member of the National Academy of Engineering, a Fellow
12 | of the ACM and the IEEE, the 2010 recipient of the IEEE Kobayashi
13 | Computer and Communication Award, and the 2013 recipient of the ACM
14 | SIGCOMM Award. He received his Ph.D. degree from Purdue University.
15 | 
16 | **Lawrence Brakmo** currently works in the Kernel group at Facebook.
17 | Prior to joining Facebook, he was a member of the Host Networking
18 | group at Google, and before that, a researcher and project manager of
19 | the OS group at DoCoMo USA Labs. Brakmo has worked on TCP enhancements
20 | to improve network performance, including the design of the TCP Vegas
21 | and TCP-NV congestion control algorithms. He has also developed OS
22 | techniques to improve system reliability, performance, and energy
23 | consumption. Brakmo received his Ph.D. degree in Computer Science from
24 | The University of Arizona.
25 | 
26 | **Bruce Davie** is a computer scientist noted for his contributions to
27 | the field of networking. He is a former VP and CTO for the Asia
28 | Pacific region at VMware. He joined VMware during the acquisition of
29 | Software Defined Networking (SDN) startup Nicira. Prior to that, he
30 | was a Fellow at Cisco Systems, leading a team of architects
31 | responsible for Multiprotocol Label Switching (MPLS). Davie has over
32 | 30 years of networking industry experience and has co-authored 17
33 | RFCs. He was recognized as an ACM Fellow in 2009 and chaired ACM
34 | SIGCOMM from 2009 to 2013. He was also a visiting lecturer at the
35 | Massachusetts Institute of Technology for five years. Davie is the
36 | author of multiple books and the holder of more than 40 U.S. patents.
37 | 
--------------------------------------------------------------------------------
/avoidance.rst:
--------------------------------------------------------------------------------
1 | Chapter 5: Avoidance-Based Algorithms
2 | ======================================
3 | 
4 | .. include:: 
5 | 
6 | A review of the academic literature on TCP congestion control shows a
7 | notable gap between the original TCP Tahoe and Reno mechanisms
8 | introduced in 1988 and 1990, respectively, and the next major flurry
9 | of activity starting in 1994, marked by the introduction of an
10 | alternative approach known as TCP Vegas. This triggered an avalanche
11 | of comparative studies and alternative designs that would persist for
12 | the next 25+ years.
13 | 
14 | .. _reading_vegas:
15 | .. admonition:: Further Reading
16 | 
17 |    L. Brakmo, S. O'Malley, and L. Peterson.
18 |    `TCP Vegas: New Technique for Congestion Detection and Avoidance
19 |    `__.
20 |    ACM SIGCOMM '94 Symposium. August 1994. (Reprinted in IEEE/ACM Transactions
21 |    on Networking, October 1995).
22 | 
23 | Whereas every approach described to date sees packet loss as a
24 | congestion signal and tries to react to *control* congestion after the
25 | onset, TCP Vegas takes an *avoidance-based* approach to congestion: it
26 | tries to detect changes in the measured throughput rate and adjust
27 | the sending rate *before* congestion becomes severe enough to cause
28 | packet loss. This chapter describes the general "Vegas strategy",
29 | along with three example variations on that strategy introduced over
30 | time. This case study culminates in the BBR algorithm championed by
31 | Google today.
32 | 
33 | 5.1 TCP Vegas
34 | -------------
35 | 
36 | The essential idea behind TCP Vegas is to adapt the sending rate based
37 | on a comparison of the *measured* throughput rate with the *expected*
38 | throughput rate. The intuition can be seen in the trace of TCP Reno
39 | given in :numref:`Figure %s <fig-trace3>`. The top graph traces the
40 | connection’s congestion window; it shows the same information as the
41 | traces given in the previous chapter. The middle and bottom graphs
42 | depict new information: the middle graph shows the average sending
43 | rate as measured at the source, and the bottom graph shows the average
44 | queue length as measured at the bottleneck router. All three graphs
45 | are synchronized in time. In the period between 4.5 and 6.0 seconds
46 | (shaded region), the congestion window increases (top graph). We
47 | expect the observed throughput to also increase, but instead it stays
48 | flat (middle graph). This is because the throughput cannot increase
49 | beyond the available bandwidth.
Beyond this point, any increase in the
50 | window size only results in packets taking up buffer space at the
51 | bottleneck router (bottom graph).
52 | 
53 | .. _fig-trace3:
54 | .. figure:: figures/f06-18-9780123850591.png
55 |    :width: 600px
56 |    :align: center
57 | 
58 |    Congestion window versus observed throughput rate (the
59 |    three graphs are synchronized). Top, congestion window; middle,
60 |    observed throughput; bottom, buffer space taken up at the
61 |    router. Colored line = ``CongestionWindow``; solid bullet = timeout;
62 |    hash marks = time when each packet is transmitted; vertical bars =
63 |    time when a packet that was eventually retransmitted was first
64 |    transmitted.
65 | 
66 | A useful metaphor that describes the phenomenon illustrated in
67 | :numref:`Figure %s <fig-trace3>` is driving on ice. The speedometer
68 | (congestion window) may say that you are going 30 miles an hour, but
69 | by looking out the car window and seeing people pass you on foot
70 | (measured throughput rate) you know that you are going no more than 5
71 | miles an hour. The uselessly spinning wheels in this analogy are like
72 | the extra packets being sent only to sit uselessly in router buffers.
73 | 
74 | TCP Vegas uses this idea to measure and control the amount of extra data
75 | this connection has in transit, where by “extra data” we mean data that
76 | the source would not have transmitted had it been able to match
77 | exactly the available bandwidth of the network. The goal of TCP Vegas is
78 | to maintain the “right” amount of extra data in the network. Obviously,
79 | if a source is sending too much extra data, it will cause long delays
80 | and possibly lead to congestion. Less obviously, if a connection is
81 | sending too little extra data, it cannot respond rapidly enough to
82 | transient increases in the available network bandwidth. TCP Vegas’s
83 | congestion-avoidance actions are based on changes in the estimated
84 | amount of extra data in the network, not only on dropped packets. We now
85 | describe the algorithm in detail.
86 | 
87 | First, define a given flow’s ``BaseRTT`` to be the RTT of a packet when
88 | the flow is not congested. In practice, TCP Vegas sets ``BaseRTT`` to
89 | the minimum of all measured round-trip times; it is commonly the RTT of
90 | the first packet sent by the connection, before the router queues
91 | increase due to traffic generated by this flow. If we assume that we are
92 | not overflowing the connection, then the expected throughput is given by
93 | 
94 | .. math:: \mathsf{ExpectedRate = CongestionWindow\ /\ BaseRTT}
95 | 
96 | where ``CongestionWindow`` is the TCP congestion window, which we
97 | assume (for the purpose of this discussion) to be equal to the number
98 | of bytes in transit.
99 | 
100 | Second, TCP Vegas calculates the current sending rate, ``ActualRate``.
101 | This is done by recording the sending time for a distinguished packet,
102 | recording how many bytes are transmitted between the time that packet
103 | is sent and when its acknowledgment is received, computing the sample
104 | RTT for the distinguished packet when its acknowledgment arrives, and
105 | dividing the number of bytes transmitted by the sample RTT. This
106 | calculation is done once per round-trip time.
107 | 
108 | Third, TCP Vegas compares ``ActualRate`` to ``ExpectedRate`` and
109 | adjusts the window accordingly. We let ``Diff = ExpectedRate -
110 | ActualRate``.
Note that ``Diff`` is positive or 0 by definition,
111 | since the only way ``ActualRate > ExpectedRate`` is if the measured
112 | sample RTT is less than ``BaseRTT``. If that happens, we change
113 | ``BaseRTT`` to the latest sampled RTT. We also define two thresholds,
114 | :math:`\alpha` < :math:`\beta`, corresponding to having too little and too much
115 | extra data in the network, respectively. When ``Diff`` < :math:`\alpha`, TCP
116 | Vegas increases the congestion window linearly during the next RTT,
117 | and when ``Diff`` > :math:`\beta`, TCP Vegas decreases the congestion window
118 | linearly during the next RTT. TCP Vegas leaves the congestion window
119 | unchanged when :math:`\alpha` < ``Diff`` < :math:`\beta`.
120 | 
121 | Intuitively, we can see that the farther away the actual throughput
122 | gets from the expected throughput, the more congestion there is in the
123 | network, which implies that the sending rate should be reduced. The
124 | :math:`\beta` threshold triggers this decrease. On the other hand, when the
125 | actual throughput rate gets too close to the expected throughput, the
126 | connection is in danger of not utilizing the available bandwidth. The
127 | :math:`\alpha` threshold triggers this increase. The overall goal is to keep
128 | between :math:`\alpha` and :math:`\beta` extra bytes in the network.
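
Pulling the three steps together, the per-RTT update amounts to only a few
lines of code. The sketch below is ours, not taken from the Vegas
implementation; the state structure is illustrative, and the thresholds are
expressed in KBps, matching the units used in this discussion.

.. code-block:: c

   #include <stdint.h>

   #define ALPHA 30  /* KBps: too little extra data in the network */
   #define BETA  60  /* KBps: too much extra data in the network   */

   struct vegas_state {
       double   base_rtt;    /* minimum RTT observed, in seconds */
       double   sample_rtt;  /* RTT of the distinguished packet  */
       uint32_t cwnd;        /* congestion window, in KB         */
       uint32_t maxseg;      /* maximum segment size, in KB      */
   };

   /* Once per RTT: compare expected and actual rates and nudge the
    * congestion window in the appropriate direction. */
   void vegas_update(struct vegas_state *s, double actual_rate /* KBps */)
   {
       if (s->sample_rtt < s->base_rtt)
           s->base_rtt = s->sample_rtt;  /* keep BaseRTT a true minimum */

       double expected_rate = s->cwnd / s->base_rtt;  /* KBps   */
       double diff = expected_rate - actual_rate;     /* >= 0   */

       if (diff < ALPHA)
           s->cwnd += s->maxseg;   /* linear increase over next RTT */
       else if (diff > BETA)
           s->cwnd -= s->maxseg;   /* linear decrease over next RTT */
       /* otherwise leave the window unchanged */
   }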
129 | 
130 | .. _fig-vegas:
131 | .. figure:: figures/f06-19-9780123850591.png
132 |    :width: 600px
133 |    :align: center
134 | 
135 |    Trace of TCP Vegas congestion-avoidance mechanism.
136 |    Top, congestion window; bottom, expected (colored line) and actual
137 |    (black line) throughput. The shaded area is the region between the
138 |    :math:`\alpha` and :math:`\beta` thresholds.
139 | 
140 | :numref:`Figure %s <fig-vegas>` traces the TCP Vegas
141 | congestion-avoidance algorithm. The top graph traces the congestion
142 | window, showing the same information as the other traces given
143 | throughout this chapter. The bottom graph traces the expected and
144 | actual throughput rates that govern how the congestion window is
145 | set. It is this bottom graph that best illustrates how the algorithm
146 | works. The colored line tracks the ``ExpectedRate``, while the black
147 | line tracks the ``ActualRate``. The wide shaded strip gives the region
148 | between the :math:`\alpha` and :math:`\beta` thresholds; the top of the shaded strip is
149 | :math:`\alpha` KBps away from ``ExpectedRate``, and the bottom of the shaded
150 | strip is :math:`\beta` KBps away from ``ExpectedRate``. The goal is to keep the
151 | ``ActualRate`` between these two thresholds, within the shaded
152 | region. Whenever ``ActualRate`` falls below the shaded region (i.e.,
153 | gets too far from ``ExpectedRate``), TCP Vegas decreases the
154 | congestion window because it fears that too many packets are being
155 | buffered in the network. Likewise, whenever ``ActualRate`` goes above
156 | the shaded region (i.e., gets too close to the ``ExpectedRate``), TCP
157 | Vegas increases the congestion window because it fears that it is
158 | underutilizing the network.
159 | 
160 | Because the algorithm, as just presented, compares the difference
161 | between the actual and expected throughput rates to the :math:`\alpha` and :math:`\beta`
162 | thresholds, these two thresholds are defined in terms of KBps. However,
163 | it is perhaps more accurate to think in terms of how many extra
164 | *packet buffers* the connection is occupying in the network. For example, on a
165 | connection with a ``BaseRTT`` of 100 ms and a packet size of 1 KB, if
166 | :math:`\alpha` = 30 KBps and :math:`\beta` = 60 KBps, then we can think of :math:`\alpha` as specifying
167 | that the connection needs to be occupying at least 3 extra buffers in
168 | the network and :math:`\beta` as specifying that the connection should occupy no
169 | more than 6 extra buffers in the network. This setting of :math:`\alpha`
170 | and :math:`\beta` worked well in practice when Vegas was first deployed, but
171 | as we'll see in the next section, these parameters continue to be tuned
172 | for changing circumstances.
173 | 
174 | Finally, you will notice that TCP Vegas decreases the congestion window
175 | linearly, seemingly in conflict with the rule that multiplicative
176 | decrease is needed to ensure stability. The explanation is that TCP
177 | Vegas does use multiplicative decrease when a timeout occurs; the linear
178 | decrease just described is an *early* decrease in the congestion window
179 | that should happen before congestion occurs and packets start being
180 | dropped.
181 | 
182 | 5.2 Varied Assumptions
183 | ----------------------
184 | 
185 | TCP Vegas—and Vegas-like approaches to avoiding congestion—have been
186 | adapted over time, often in response to different assumptions about
187 | the network. Vegas was never as widely deployed as Reno, so the
188 | modifications were often driven more by lab studies than extensive
189 | real-world experience, but they have collectively refined and
190 | contributed to our understanding of avoidance-based algorithms. We
191 | summarize some of those insights here, but return to the general topic
192 | of customizing the congestion control algorithm for specific use cases
193 | in Chapter 7.
194 | 
195 | 5.2.1 FAST TCP
196 | ~~~~~~~~~~~~~~
197 | 
198 | The first Vegas-inspired mechanism was FAST TCP, which modified Vegas
199 | to be more efficient on high-speed networks with large bandwidth-delay
200 | products. The idea was to increase the congestion window more
201 | aggressively during the phase when the algorithm is trying to find the
202 | available "in transit" bandwidth (before packets are buffered in the
203 | network), and then more conservatively as the algorithm starts to
204 | compete with other flows for buffers at the bottleneck router. FAST
205 | also recommended adjusting the value of :math:`\alpha` to roughly 30 packets.
206 | 
207 | Beyond managing congestion in networks with large bandwidth-delay
208 | products, where keeping the pipe full is a substantial challenge,
209 | there are two other items of note about FAST. First, whereas both TCP
210 | Reno and TCP Vegas were the result of a little intuition and a lot of
211 | trial-and-error, FAST was grounded in optimization theory (which was
212 | subsequently used to explain why Vegas works). Second, unlike all
213 | other congestion control algorithms of which we are aware, an
214 | implementation of FAST was made available only as a proprietary
215 | solution.
216 | 
217 | .. _reading_fast:
218 | .. admonition:: Further Reading
219 | 
220 |    S. Low, L. Peterson, and L. Wang. `Understanding TCP Vegas: A
221 |    Duality Model `__.
222 |    Journal of the ACM, Volume 49, Issue 2, March 2002.
223 | 
224 | 
225 | 5.2.2 TCP Westwood
226 | ~~~~~~~~~~~~~~~~~~
227 | 
228 | While Vegas was motivated by the idea that congestion can be detected
229 | and averted *before* a loss occurs, TCP Westwood (TCPW) is motivated
230 | primarily by the realization that packet loss is not always a reliable
231 | indicator of congestion. This is particularly noticeable with wireless
232 | links, which were a novelty at the time of Vegas but were becoming common
233 | by the time of TCPW. Wireless links often lose packets due to
234 | uncorrected errors on the wireless channel, which are unrelated to
235 | congestion. Hence, congestion needs to be detected another
236 | way. Interestingly, the end result is somewhat similar to Vegas, in
237 | that TCPW also tries to determine the bottleneck bandwidth by looking
238 | at the rate at which ACKs are coming back for those packets that were
239 | delivered successfully.
240 | 
241 | When a packet loss occurs, TCPW does not immediately cut the
242 | congestion window in half, as it does not yet know if the loss was due
243 | to congestion or a link-related packet loss. Instead, it estimates
244 | the rate at which traffic was flowing right before the packet loss
245 | occurred, and uses that estimate to set its new congestion window. This is a
246 | less aggressive form of backoff than TCP Reno's halving. If
247 | the loss was congestion-related, TCPW should send at the rate that was
248 | acceptable before the loss. And if the loss was caused by a wireless
249 | error, TCPW has not backed off so much, and will start to ramp up
250 | again to fully utilize the network. The result was a protocol that
251 | performed similarly to Reno on fixed links but outperformed it by
252 | substantial margins when lossy links were involved.
253 | 
254 | Tuning the congestion control algorithm to deal with wireless links
255 | continues to be a challenging problem, and to complicate matters, WiFi
256 | and mobile cellular networks have different properties. We return
257 | to this issue in Chapter 7.
258 | 
259 | 
260 | 5.2.3 New Vegas
261 | ~~~~~~~~~~~~~~~
262 | 
263 | Our final example is New Vegas (NV), an adaptation of Vegas's
264 | delay-based approach to datacenters, where link bandwidths are 10Gbps
265 | or higher and RTTs are typically measured in the tens of
266 | microseconds. This is an important use case that we return to in
267 | Chapter 7; our goal here is to build some intuition.
268 | 
269 | To understand the basic idea of NV, suppose that we plot ``Rate``
270 | versus ``CongestionWindow`` for every packet for which an ACK is
271 | received. For the purpose of this exercise, ``Rate`` is simply the
272 | ratio of ``CongestionWindow`` (in bytes) to the RTT of packets that
273 | have been ACKed (in seconds). Note that we use ``CongestionWindow``
274 | in this discussion for simplicity, while in practice NV uses in-flight
275 | (unacknowledged) bytes. When plotted over time, as shown in
276 | :numref:`Figure %s <fig-nv>`, we end up with vertical bars (rather
277 | than points) for values of ``CongestionWindow`` due to transient
278 | congestion or noise in the measurements.
279 | 
280 | .. _fig-nv:
281 | .. figure:: figures/Slide4.png
282 |    :width: 500px
283 |    :align: center
284 | 
285 |    Plotting measured rate vs congestion window.
286 | 
287 | The maximum slope of the top of the bars indicates the best we have
288 | been able to do in the past. In a well-tuned system, the top of the
289 | bars is bounded by a straight line going through the origin. The idea
290 | is that as long as the network is not congested, doubling the amount
291 | of data we send per RTT should double the rate.
292 | 
293 | New measurements of ``Rate`` and ``CongestionWindow`` can either fall close to the
294 | boundary line (black diamond in the figure) or below it (blue diamond in the
295 | figure). A measurement above the line causes NV to automatically
296 | update the line by increasing its slope so the measurement will fall
297 | on the new line. If the new measurement is close to the line, then NV
298 | increases ``CongestionWindow``. If the measurement is below the line, it means
299 | that we have seen equal performance in the past with a lower
300 | ``CongestionWindow``. In the example shown in :numref:`Figure %s <fig-nv>`, we see
301 | similar performance with ``CongestionWindow=12``, so we decrease ``CongestionWindow``. The
302 | decrease is done multiplicatively, rather than instantaneously, in case
303 | the new measurement is noisy. To filter out bad measurements, NV
304 | collects many measurements and then uses the best one before making a
305 | congestion determination.
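
A sketch of this line test appears below. The names, the closeness
threshold, and the decay factor are hypothetical values of our choosing, and
NV's actual measurement filtering is more involved, but the three cases
mirror the description above.

.. code-block:: c

   #include <stdint.h>

   /* NV maintains a boundary line rate = slope * cwnd through the
    * origin; measurements at or near the line are "uncongested". */
   struct nv_state {
       double   slope;  /* best observed rate/cwnd ratio so far */
       uint32_t cwnd;   /* congestion window, in packets        */
   };

   void nv_on_measurement(struct nv_state *s, double rate, uint32_t cwnd)
   {
       double expected = s->slope * cwnd;

       if (rate > expected) {
           /* Better than anything seen before: steepen the line. */
           s->slope = rate / cwnd;
           s->cwnd++;
       } else if (rate >= 0.9 * expected) {
           /* Close to the line: not congested, keep growing. */
           s->cwnd++;
       } else {
           /* The same rate was achieved with a smaller window in the
            * past: back off multiplicatively toward that window. */
           uint32_t target = (uint32_t)(rate / s->slope);
           s->cwnd -= (s->cwnd - target) / 8;  /* gradual, noise-tolerant */
       }
   }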
306 | 
307 | 
308 | 5.3 TCP BBR
309 | ---------------
310 | 
311 | BBR (Bottleneck Bandwidth and RTT) is a new TCP congestion control
312 | algorithm developed by researchers at Google. Like Vegas, BBR is delay-based,
313 | which means it tries to detect buffer growth so as to avoid
314 | congestion and packet loss. Both BBR and Vegas use the minimum RTT and
315 | the observed bottleneck bandwidth, as calculated over some time
316 | interval, as their main control signals.
317 | 
318 | .. _fig-bbr:
319 | .. figure:: figures/Slide5.png
320 |    :width: 500px
321 |    :align: center
322 | 
323 |    Determining the optimal sending rate based on observed throughput
324 |    and RTT.
325 | 
326 | :numref:`Figure %s <fig-bbr>` shows the basic idea underlying
327 | BBR. Assume a network has a single bottleneck link with some available
328 | bandwidth and queuing capacity. As the congestion window opens and
329 | more data is put in flight, initially there is an increase in
330 | throughput (on the lower graph) but no increase in delay, as the
331 | bottleneck is not full. Then, once the data rate reaches the bottleneck
332 | bandwidth, a queue starts to build. At this point, RTT rises, and no
333 | rise in throughput is observed. This is the beginning of the
334 | congestion phase. This graph is really a simplified version of what we
335 | see in the 4.5 to 6.0 second timeframe in :numref:`Figure %s
336 | <fig-trace3>`.
337 | 
338 | Like Vegas, BBR aims to accurately determine the point where the
339 | queue has just started to build, as opposed to continuing all the way
340 | to the point of filling the buffer and causing packet drops as Reno
341 | does. A lot of the work in BBR has been around improving the
342 | sensitivity of the mechanisms that locate that sweet spot. There are
343 | numerous challenges: measurements of bandwidth and delay are noisy;
344 | network conditions are not static; and there is the perennial quest for
345 | fairness when competing for bandwidth against both BBR and non-BBR
346 | flows.
347 | 
348 | One striking feature of BBR compared to the other approaches we have
349 | seen is that it does not rely solely on ``CongestionWindow`` to determine how much
350 | data is put in flight. Notably, BBR also tries to smooth out the rate
351 | at which a sender puts data into the network in an effort to avoid
352 | bursts that would lead to excessive queuing.
Under ideal conditions,
353 | we would like to send data exactly at the rate of the bottleneck, thus
354 | achieving the highest possible throughput without causing a queue to
355 | build up. Whereas most TCP variants use the arrival of an ACK to
356 | "clock" the sending of data, thus ensuring that the amount of
357 | unacknowledged data in flight remains constant, BBR creates an
358 | estimate of the bottleneck bandwidth and uses a local scheduling
359 | algorithm to send data at that rate. ACKs still play an important role
360 | in updating knowledge about the state of the network, but they are not
361 | directly used to pace transmissions. This means that delayed ACKs do
362 | not lead to sudden bursts of transmission. Of course, ``CongestionWindow`` is
363 | still used to ensure that enough data is sent to keep the pipe full,
364 | and to ensure that the amount of data in flight is not so much greater
365 | than the bandwidth-delay product as to cause queues to overflow.
366 | 
367 | In order to maintain an up-to-date view of the current RTT and
368 | bottleneck bandwidth, it is necessary to keep probing above and below
369 | the current estimate of the bottleneck bandwidth. More bandwidth can
370 | become available due to a reduction in the traffic from competing
371 | flows, changes in link properties (e.g., on wireless links), or routing
372 | changes. Changes in RTT are also possible, particularly if the path
373 | changes. To detect a change in RTT, it is necessary to send less
374 | traffic, hence draining queues. To detect a change in available
375 | bandwidth, it is necessary to send more traffic. Hence, BBR probes
376 | both above and below its current estimate of the bottleneck
377 | bandwidth. If necessary, the estimates are updated, and the sending
378 | rate and ``CongestionWindow`` are updated accordingly.
379 | 
380 | .. _fig-bbrstate:
381 | .. figure:: figures/Slide6.png
382 |    :width: 150px
383 |    :align: center
384 | 
385 |    State machine diagram for BBR.
386 | 
387 | The process of sequentially probing for the available bandwidth and
388 | the minimum RTT is captured in the state diagram of :numref:`Figure %s
389 | <fig-bbrstate>`. After an aggressive startup phase to try to establish
390 | the available bandwidth on the path, the sending rate is reduced to
391 | drain the queue, and then the algorithm settles into the inner loop of
392 | the diagram, in which it periodically checks for better delay at lower
393 | sending rates, or better throughput at higher sending rates. On a
394 | relatively long timescale (multiple seconds) the algorithm moves into
395 | the ``ProbeRTT`` state, lowering its sending rate by a factor of two in an
396 | effort to fully drain the queue and test for a lower RTT.
397 | 
398 | One interesting aspect of this approach is that when a large flow
399 | reduces its sending rate dramatically in the ``ProbeRTT`` state, that flow's contribution to queuing delay
400 | drops, which causes other flows to simultaneously see a new, lower RTT and update
401 | their estimates. Hence flows show a tendency to synchronize their RTT
402 | estimation at times when the queue is actually empty or close to it,
403 | improving the accuracy of this estimate.
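
To summarize the mechanics, the sketch below shows how a BBR-like sender
might turn its two estimates into a pacing schedule and a cap on in-flight
data. The structure and constants are illustrative assumptions on our part
(in BBR version 1, the pacing gain cycles above and below 1.0 to implement
the probing just described), not a transcription of any implementation.

.. code-block:: c

   #include <stdint.h>

   struct bbr_state {
       double btl_bw;       /* estimated bottleneck bandwidth, bytes/sec */
       double min_rtt;      /* estimated minimum RTT, seconds            */
       double pacing_gain;  /* >1 to probe for bandwidth, <1 to drain    */
   };

   /* Seconds to wait before transmitting the next 'len'-byte packet,
    * so data leaves the host at (pacing_gain * btl_bw) rather than in
    * ACK-triggered bursts. */
   double next_send_interval(const struct bbr_state *s, uint32_t len)
   {
       return len / (s->pacing_gain * s->btl_bw);
   }

   /* Cap on unacknowledged data: a small multiple of the estimated
    * bandwidth-delay product, keeping the pipe full without building
    * a standing queue. */
   uint32_t inflight_cap(const struct bbr_state *s)
   {
       double bdp = s->btl_bw * s->min_rtt;
       return (uint32_t)(2 * bdp);
   }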
404 | 
405 | 
406 | BBR is actively being worked on and rapidly evolving, with version 2
407 | in use at the time of writing. One major focus is fairness. For
408 | example, some early experiments showed CUBIC flows getting 100x less
409 | bandwidth when competing with BBR flows, and other experiments showed that
410 | unfairness among BBR flows is possible. BBR version 1 was insensitive
411 | to loss, which could lead to high loss rates, particularly when the
412 | amount of buffering on the path was relatively low. As several
413 | implementations of BBR are now being tried in different environments,
414 | including within Google's internal backbone and in the broader
415 | Internet, experience is being gathered to further refine the design. The
416 | IETF's Congestion Control Working Group is hosting discussions on the
417 | ongoing design and experimentation.
418 | 
419 | 
420 | .. _reading_bbr:
421 | .. admonition:: Further Reading
422 | 
423 |    N. Cardwell, Y. Cheng, C. S. Gunn, S. Yeganeh, and V. Jacobson. `BBR: Congestion-based
424 |    Congestion Control
425 |    `__.
426 |    Communications of the ACM, Volume 60, Issue 2, February 2017.
427 | 
428 | 
429 | 
--------------------------------------------------------------------------------
/biblio.rst:
--------------------------------------------------------------------------------
1 | Annotated Bibliography
2 | =======================
3 | 
4 | The set of research papers published on congestion control is
5 | extensive, with only a small subset cited in the main body of the
6 | book. This section is a place to collect a more comprehensive
7 | bibliography, which (for now) is organized according to the major themes
8 | covered in the book.
9 | 
10 | We invite the community to help keep the bibliography complete and
11 | up-to-date. Please submit a `Pull Request to GitHub
12 | `__ to include additional
13 | citations or to fix mistakes. Post an `Issue to GitHub
14 | `__ if you have
15 | suggestions for ways to improve how the bibliography is organized.
16 | 
17 | Foundational
18 | -----------------
19 | 
20 | Queuing Analysis
21 | ~~~~~~~~~~~~~~~~~~~~
22 | 
23 | * L. Kleinrock. `Queueing Systems, Volume 2
24 |   `__. Wiley &
25 |   Sons, May 1976.
26 | 
27 | * V. Paxson and S. Floyd. `Wide-Area Traffic: The Failure of Poisson
28 |   Modeling `__.
29 |   IEEE/ACM Transactions on Networking, June 1995.
30 | 
31 | * W. Leland, *et al*. `On the Self-Similar Nature of Ethernet Traffic
32 |   `__.
33 |   ACM SIGCOMM '93 Symposium, August 1993.
34 | 
35 | * J. Gettys. `Bufferbloat: Dark Buffers in the Internet
36 |   `__.
37 |   IEEE Internet Computing, April 2011.
38 | 
39 | Theoretical Underpinnings
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | * M. Mathis, J. Semke, J. Mahdavi, and T. Ott. `The Macroscopic
43 |   Behavior of the TCP Congestion Avoidance Algorithm
44 |   `__.
45 |   SIGCOMM CCR, 27(3), July 1997.
46 | 
47 | * F. Kelly. `Charging and Rate Control for Elastic Traffic
48 |   `__.
49 |   European Transactions on Telecommunications, 8:33–37, 1997.
50 | 
51 | * S. Athuraliya and S. Low. `An Empirical Validation of a Duality
52 |   Model of TCP and Active Queue Management Algorithms
53 |   `__.
54 |   Proceedings of the Winter Simulation Conference, 2001.
55 | 
56 | * R. Jain and K. K. Ramakrishnan. `Congestion Avoidance in Computer
57 |   Networks with a Connectionless Network Layer: Concepts, Goals and
58 |   Methodology `__. Computer
59 |   Networking Symposium, April 1988.
60 | 
61 | Evaluation Criteria
62 | ~~~~~~~~~~~~~~~~~~~~
63 | 
64 | * R. Jain, D. Chiu, and W. Hawe. `A Quantitative Measure of Fairness
65 |   and Discrimination for Resource Allocation in Shared Computer Systems
66 |   `__.
67 |   DEC Research Report TR-301, 1984.
68 | 
69 | * B. Briscoe.
`Flow Rate Fairness: Dismantling a Religion
70 |   `__.
71 |   ACM SIGCOMM CCR, April 2007.
72 | 
73 | * R. Ware, *et al*. `Beyond Jain's Fairness Index: Setting the Bar for
74 |   the Deployment of Congestion Control Algorithms
75 |   `__.
76 |   ACM SIGCOMM HotNets, November 2019.
77 | 
78 | Architecture
79 | ~~~~~~~~~~~~~
80 | 
81 | * J. Saltzer, D. Reed, and D. Clark. `End-to-End Arguments in System Design
82 |   `__.
83 |   ACM Transactions on Computer Systems, Nov. 1984.
84 | 
85 | * D. Clark. `The Design Philosophy of the DARPA Internet Protocols
86 |   `__.
87 |   ACM SIGCOMM, 1988.
88 | 
89 | * S. Jain, *et al*. `B4: Experience with a
90 |   Globally-Deployed Software Defined WAN
91 |   `__.
92 |   ACM SIGCOMM, August 2013.
93 | 
94 | * J. Perry, *et al*. `Fastpass: A Centralized "Zero-Queue" Datacenter Network
95 |   `__.
96 |   ACM SIGCOMM, August 2014.
97 | 
98 | 
99 | General-Purpose Algorithms
100 | --------------------------------
101 | 
102 | * V. Jacobson. `Congestion Avoidance and Control
103 |   `__. ACM SIGCOMM '88
104 |   Symposium, August 1988.
105 | 
106 | * J. Hoe. `Improving the Start-up Behavior of a Congestion Control
107 |   Scheme for TCP
108 |   `__. ACM SIGCOMM '96
109 |   Symposium. August 1996.
110 | 
111 | * L. Brakmo, S. O'Malley, and L. Peterson.
112 |   `TCP Vegas: New Technique for Congestion Detection and Avoidance
113 |   `__.
114 |   ACM SIGCOMM '94 Symposium. August 1994. (Reprinted in *IEEE/ACM Transactions
115 |   on Networking,* October 1995).
116 | 
117 | * S. Low, L. Peterson, and L. Wang. `Understanding TCP Vegas: A
118 |   Duality Model `__.
119 |   Journal of the ACM, Volume 49, Issue 2, March 2002.
120 | 
121 | * S. Ha, I. Rhee, and L. Xu. `CUBIC: a new TCP-friendly high-speed TCP variant
122 |   `__.
123 |   ACM SIGOPS Operating Systems Review, Volume 42, Issue 5, July 2008.
124 | 
125 | * N. Cardwell, Y. Cheng, C. S. Gunn, S. Yeganeh, and V. Jacobson.
126 |   `BBR: Congestion-based Congestion Control
127 |   `__.
128 |   Communications of the ACM, Volume 60, Issue 2, February 2017.
129 | 
130 | * B. Briscoe, *et al*. `Implementing the "Prague Requirements" for Low
131 |   Latency Low Loss Scalable Throughput (L4S)
132 |   `__.
133 |   Linux NetDev 0x13 Conference, March 2019.
134 | 
135 | Active Queue Management
136 | ---------------------------------
137 | 
138 | * K.K. Ramakrishnan and R. Jain. `A Binary Feedback Scheme for
139 |   Congestion Avoidance in Computer Networks with a Connectionless
140 |   Network Layer `__.
141 |   ACM SIGCOMM, August 1988.
142 | 
143 | * S. Floyd and V. Jacobson. `Random Early Detection (RED) Gateways for Congestion Avoidance
144 |   `__.
145 |   IEEE/ACM Transactions on Networking, August 1993.
146 | 
147 | * R. Braden, *et al*. `Recommendations on Queue Management and
148 |   Congestion Avoidance in the Internet
149 |   `__. RFC 2309, April 1998.
150 | 
151 | * K. Ramakrishnan, S. Floyd, and D. Black. `The Addition of Explicit
152 |   Congestion Notification (ECN) to IP
153 |   `__. RFC 3168,
154 |   September 2001.
155 | 
156 | * K. Nichols and V. Jacobson. `Controlling Queue Delay
157 |   `__.
158 |   ACM Queue, 10(5), May 2012.
159 | 
160 | Domain-Specific Algorithms
161 | -------------------------------
162 | 
163 | Datacenter
164 | ~~~~~~~~~~~~~~~~
165 | 
166 | * M. Alizadeh, *et al*. `Data Center TCP (DCTCP)
167 |   `__.
168 |   ACM SIGCOMM, August 2010.
169 | 
170 | * R. Mittal, *et al*. `TIMELY: RTT-based Congestion Control for the Datacenter
171 |   `__.
172 |   ACM SIGCOMM 2015.
173 | 
174 | * S. Liu, *et al*. `Breaking the Transience-Equilibrium Nexus: A New
175 |   Approach to Datacenter Packet Transport
176 |   `__.
177 |   Usenix NSDI '21. April 2021.
178 | 
179 | Background Transfers
180 | ~~~~~~~~~~~~~~~~~~~~~~~
181 | 
182 | * S. Shalunov, *et al*. `Low Extra Delay Background Transport (LEDBAT)
183 |   `__.
184 |   RFC 6817, December 2012.
185 | 
186 | HTTP
187 | ~~~~~~~~~~~~
188 | 
189 | * J. Iyengar and I. Swett, Eds.
190 |   `QUIC Loss Detection and Congestion Control
191 |   `__.
192 |   RFC 9002, May 2021.
193 | 
194 | Wireless
195 | ~~~~~~~~~~~~~~
196 | 
197 | * H. Jiang, Z. Liu, Y. Wang, K. Lee, and I. Rhee.
198 |   `Understanding Bufferbloat in Cellular Networks
199 |   `__.
200 |   ACM SIGCOMM Workshop on Cellular Networks, August 2012.
201 | 
202 | * K. Liu and J. Y. B. Lee. `On Improving TCP Performance over Mobile
203 |   Data Networks `__.
204 |   IEEE Transactions on Mobile Computing, 2016.
205 | 
206 | * Y. Xie, F. Yi, and K. Jamieson. `PBE-CC: Congestion Control via
207 |   Endpoint-Centric, Physical-Layer Bandwidth Measurements
208 |   `__. ACM SIGCOMM 2020.
209 | 
210 | * Y. Gao, *et al*. `Understanding On-device Bufferbloat For Cellular
211 |   Upload `__.
212 |   ACM Internet Measurement Conference (IMC), November 2016.
213 | 
214 | 
215 | Realtime
216 | ~~~~~~~~~~~~~~~
217 | 
218 | * S. Floyd, M. Handley, J. Padhye, and J. Widmer.
219 |   `TCP Friendly Rate Control (TFRC): Protocol Specification
220 |   `__.
221 |   RFC 5348, September 2008.
222 | 
223 | * J. Padhye, V. Firoiu, D. Towsley, and J. Kurose.
224 |   `Modeling TCP Throughput: A Simple Model and its Empirical Validation
225 |   `__.
226 |   ACM SIGCOMM, September 1998.
227 | 
228 | Multipath
229 | ~~~~~~~~~
230 | 
231 | * D. Wischik, C. Raiciu, A. Greenhalgh, and M. Handley.
232 |   `Design, Implementation and Evaluation of Congestion Control for Multipath TCP
233 |   `__.
234 |   NSDI, April 2011.
235 | 
236 | * C. Raiciu, M. Handley, and D. Wischik.
237 |   `Coupled Congestion Control for Multipath Transport Protocols
238 |   `__.
239 |   RFC 6356, October 2011.
240 | 
241 | 
242 | Implementations and Tools
243 | --------------------------------
244 | 
245 | * S.J. Leffler, M.K. McKusick, M.J. Karels, and J.S. Quarterman. `The
246 |   Design and Implementation of the 4.3 BSD Unix Operating System `__.
247 |   Addison-Wesley, January 1989.
248 | 
249 | * `Netesto `__.
250 | 
251 | * `NS-3 Network Simulator `__.
252 | 
253 | * `RFC 6298: Computing TCP's Retransmission Timer
254 |   `__. June 2011.
255 | 
256 | * The Linux Kernel. `BPF Documentation
257 |   `__.
258 | 
--------------------------------------------------------------------------------
/code/README:
--------------------------------------------------------------------------------
1 | Order in which code fragments are used (substituted in book.tex):
2 | 
3 | tcp_ip.rst: code/nagle.c
4 | 
5 | algorithm.rst: code/timeout.c
6 | algorithm.rst: code/cwin.c
7 | 
8 | aqm.rst: code/red.c
9 | 
10 | README: code/build.sh
11 | 
--------------------------------------------------------------------------------
/code/build.sh:
--------------------------------------------------------------------------------
1 | $ mkdir ~/tcpcc
2 | $ cd ~/tcpcc
3 | $ git clone https://github.com/SystemsApproach/tcpcc.git
--------------------------------------------------------------------------------
/code/cwin.c:
--------------------------------------------------------------------------------
1 | {
2 |     u_int cw = state->CongestionWindow;
3 |     u_int incr = state->maxseg;
4 | 
5 |     /* Below the threshold, grow by a full segment per ACK (slow
6 |      * start); above it, grow by maxseg*maxseg/cw per ACK, which
7 |      * adds up to about one segment per RTT (additive increase). */
8 |     if (cw > state->CongestionThreshold)
9 |         incr = incr * incr / cw;
10 |     state->CongestionWindow = MIN(cw + incr, TCP_MAXWIN);
11 | }
--------------------------------------------------------------------------------
/code/nagle.c:
--------------------------------------------------------------------------------
1 | When the application produces data to send
2 | if both the available data and the window >= MSS
3 |     send a full segment
4 | else
5 |     if there is unACKed data in flight
6 |         buffer the new data until an ACK arrives
7 |     else
8 |         send all the new data now
--------------------------------------------------------------------------------
/code/red.c:
--------------------------------------------------------------------------------
1 | if AvgLen <= MinThreshold
2 |     queue the packet
3 | if MinThreshold < AvgLen < MaxThreshold
4 |     calculate probability P
5 |     drop the arriving packet with probability P
6 | if MaxThreshold <= AvgLen
7 |     drop the arriving packet
--------------------------------------------------------------------------------
/code/timeout.c:
--------------------------------------------------------------------------------
1 | {
2 |     /* Jacobson/Karels RTT estimation in scaled integer arithmetic:
3 |      * EstimatedRTT and Deviation are stored scaled by 8, so the
4 |      * shifts implement an EWMA with gain 1/8. */
5 |     SampleRTT -= (EstimatedRTT >> 3);
6 |     EstimatedRTT += SampleRTT;     /* update the RTT estimate      */
7 |     if (SampleRTT < 0)
8 |         SampleRTT = -SampleRTT;    /* |error| in this sample       */
9 |     SampleRTT -= (Deviation >> 3);
10 |     Deviation += SampleRTT;        /* update the mean deviation    */
11 |     /* TimeOut = estimated RTT + 4 x mean deviation */
12 |     TimeOut = (EstimatedRTT >> 3) + (Deviation >> 1);
13 | }
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file only contains a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 | 
9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, as shown here.
14 | #
15 | # import os
16 | # import sys
17 | # sys.path.insert(0, os.path.abspath('.'))
18 | 
19 | import os
20 | 
21 | from subprocess import check_output, CalledProcessError
22 | 
23 | def get_version():
24 | 
25 |     try:
26 |         version = check_output(['cat', 'VERSION'],
27 |                                universal_newlines=True)
28 |     except CalledProcessError:
29 |         return 'unknown version'
30 | 
31 |     return version.rstrip()
32 | 
33 | # "version" is used for html build
34 | version = get_version()
35 | # "release" is used for LaTeX build
36 | release = version
37 | 
38 | 
39 | # -- Project information -----------------------------------------------------
40 | 
41 | project = u'TCP Congestion Control: A Systems Approach'
42 | copyright = u'2022, Systems Approach LLC (Publisher)'
43 | author = u'Peterson, Brakmo, Davie'
44 | 
45 | 
46 | # -- General configuration ---------------------------------------------------
47 | 
48 | # If your documentation needs a minimal Sphinx version, state it here.
49 | #
50 | # needs_sphinx = '1.0'
51 | 
52 | # treat all warnings as errors (currently disabled)
53 | warning_is_error = False
54 | 
55 | # Add any Sphinx extension module names here, as strings. They can be
56 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
57 | # ones. ***Replace "mathjax" with "imgmath" for epub output.***
58 | extensions = [
59 |     'sphinx.ext.autosectionlabel',
60 |     'sphinx.ext.coverage',
61 |     'sphinx.ext.ifconfig',
62 |     'sphinx.ext.mathjax',
63 |     'sphinx.ext.todo',
64 |     'sphinxcontrib.spelling',
65 |     "sphinx_multiversion",
66 | ]
67 | 
68 | # Text files with lists of words that shouldn't fail the spellchecker:
69 | spelling_word_list_filename = ['dict.txt', ]
70 | 
71 | # Add any paths that contain templates here, relative to this directory.
72 | templates_path = ['_templates']
73 | 
74 | # The suffix(es) of source filenames.
75 | # You can specify multiple suffix as a list of string:
76 | #
77 | # source_suffix = ['.rst', '.md']
78 | source_suffix = '.rst'
79 | 
80 | # The master toctree document.
81 | master_doc = 'index'
82 | 
83 | # The language for content autogenerated by Sphinx. Refer to documentation
84 | # for a list of supported languages.
85 | #
86 | # This is also used if you do content translation via gettext catalogs.
87 | # Usually you set "language" from the command line for these cases.
88 | language = 'en'
89 | 
90 | # List of patterns, relative to source directory, that match files and
91 | # directories to ignore when looking for source files.
92 | # This pattern also affects html_static_path and html_extra_path.
93 | exclude_patterns = [u'_build', 'venv-docs', 'requirements.txt', 'Thumbs.db', 'private', '.DS_Store', '*/README.rst']
94 | 
95 | # The name of the Pygments (syntax highlighting) style to use.
96 | pygments_style = None
97 | 
98 | # Enable numbered figures
99 | numfig = True
100 | numfig_format = {
101 |     'figure': 'Figure %s.',
102 |     'table': 'Table %s.'
103 | }
104 | 
105 | # Ignore link check for the following websites
106 | # linkcheck_ignore = [
107 | #     'https://SDN.systemsapproach.org/',
108 | # ]
109 | 
110 | # -- Options for HTML output -------------------------------------------------
111 | 
112 | # The theme to use for HTML and HTML Help pages. See the documentation for
113 | # a list of builtin themes.
114 | #
115 | html_theme = 'sphinx_rtd_theme'
116 | 
117 | # Theme options are theme-specific and customize the look and feel of a theme
118 | # further. For a list of options available for each theme, see the
119 | # documentation.
120 | #
121 | html_theme_options = {
122 |     'prev_next_buttons_location': 'both'
123 | }
124 | 
125 | # Add any paths that contain custom static files (such as style sheets) here,
126 | # relative to this directory. They are copied after the builtin static files,
127 | # so a file named "default.css" will overwrite the builtin "default.css".
128 | html_static_path = ['_static']
129 | 
130 | html_css_files = [
131 |     'css/rtd_theme_mods.css',
132 | ]
133 | 
134 | 
135 | # HTML Favicon
136 | html_favicon = '_static/bridge.ico'
137 | 
138 | # HTML Index
139 | html_use_index = False
140 | 
141 | # Custom sidebar templates, must be a dictionary that maps document names
142 | # to template names.
143 | #
144 | # The default sidebars (for documents that don't match any pattern) are
145 | # defined by theme itself. Builtin themes are using these templates by
146 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
147 | # 'searchbox.html']``.
148 | #
149 | # html_sidebars = {}
150 | 
151 | # extra HTML files
152 | html_extra_path = ['_extra']
153 | 
154 | # -- Options for HTMLHelp output ---------------------------------------------
155 | 
156 | # Output file base name for HTML help builder.
157 | htmlhelp_basename = 'SystemsApproach'
158 | 
159 | 
160 | # -- Options for LaTeX output ------------------------------------------------
161 | #latex_engine = 'xelatex'
162 | 
163 | latex_elements = {
164 |     # The paper size ('letterpaper' or 'a4paper').
165 |     #
166 |     'papersize': 'letterpaper',
167 | 
168 |     # The font size ('10pt', '11pt' or '12pt').
169 |     #
170 |     'pointsize': '11pt',
171 | 
172 |     # Get unicode to work
173 |     #
174 |     'fontenc': '\\usepackage[LGR,T1]{fontenc}',
175 | 
176 |     # Latex figure (float) alignment
177 |     #
178 |     'figure_align': 'ht',
179 | }
180 | 
181 | # Grouping the document tree into LaTeX files. List of tuples
182 | # (source start file, target name, title,
183 | #  author, documentclass [howto, manual, or own class]).
184 | latex_documents = [
185 |     (master_doc, 'book.tex', u'TCP Congestion Control: A Systems Approach',
186 |      u'Peterson, Brakmo and Davie', 'manual', True),
187 | ]
188 | 
189 | latex_toplevel_sectioning = 'chapter'
190 | 
191 | 
192 | # -- Options for manual page output ------------------------------------------
193 | 
194 | # One entry per manual page. List of tuples
195 | # (source start file, name, description, authors, manual section).
196 | man_pages = [
197 |     (master_doc, 'Systems Approach', u'Systems Approach',
198 |      [author], 1)
199 | ]
200 | 
201 | 
202 | # -- Options for Texinfo output ----------------------------------------------
203 | 
204 | # Grouping the document tree into Texinfo files. List of tuples
205 | # (source start file, target name, title, author,
206 | #  dir menu entry, description, category)
207 | texinfo_documents = [
208 |     (master_doc, 'TCP Congestion Control', u'TCP Congestion Control: A Systems Approach',
209 |      author, 'Peterson, Brakmo, and Davie', 'A Systems Approach',
210 |      'Miscellaneous'),
211 | ]
212 | 
213 | 
214 | # -- Options for Epub output -------------------------------------------------
215 | epub_title = project
216 | epub_description = 'Efficient Sharing of Network Resources'
217 | epub_cover = ('_static/cover.jpg', '')
218 | epub_show_urls = 'False'
219 | epub_use_index = False
220 | 
221 | imgmath_font_size = 10
222 | 
223 | # The unique identifier of the text. This can be an ISBN number
224 | # or the project homepage.
225 | #
226 | # epub_identifier = ''
227 | 
228 | # A unique identification for the text.
229 | #
230 | # epub_uid = ''
231 | 
232 | # A list of files that should not be packed into the epub file.
233 | epub_exclude_files = ['search.html', 'robots.txt']
234 | 
235 | 
236 | # -- Extension configuration -------------------------------------------------
237 | 
238 | # -- options for Intersphinx extension ---------------------------------------
239 | 
240 | intersphinx_mapping = {
241 |     'sphinx': ('https://www.sphinx-doc.org/en/master', None),
242 |     'aether': ('https://docs.aetherproject.org/master', None),
243 |     'sdcore': ('https://docs.sd-core.opennetworking.org/master', None),
244 |     'sdran': ('https://docs.sd-ran.org/master', None),
245 |     'sdfabric': ('https://docs.sd-fabric.org/master', None),
246 |     'sysapproach5g': ('https://5g.systemsapproach.org/', None),
247 |     'sysapproachnet': ('https://book.systemsapproach.org/', None),
248 |     'sysapproachsdn': ('https://sdn.systemsapproach.org/', None),
249 | }
250 | 
251 | # -- Options for todo extension ----------------------------------------------
252 | # If true, `todo` and `todoList` produce output, else they produce nothing.
253 | todo_include_todos = True
254 | 
255 | 
256 | # -- Set up Google Analytics
257 | # -- using approach at https://stackoverflow.com/questions/9444342/adding-a-javascript-script-tag-some-place-so-that-it-works-for-every-file-in-sph/41885884#41885884
258 | 
259 | 
260 | GA_INVOKE_JS = """
261 | window.dataLayer = window.dataLayer || [];
262 | function gtag(){dataLayer.push(arguments);}
263 | gtag('js', new Date());
264 | 
265 | gtag('config', 'G-SQ9EK50CDR');
266 | """
267 | 
268 | def setup(app):
269 | 
270 |     app.add_js_file('https://www.googletagmanager.com/gtag/js?id=G-SQ9EK50CDR', loading_method="async")
271 |     app.add_js_file(None, body=GA_INVOKE_JS)
--------------------------------------------------------------------------------
/design.rst:
--------------------------------------------------------------------------------
1 | Chapter 3: Design Space
2 | ==========================
3 | 
4 | With the architectural foundation of TCP/IP in place, we are ready to
5 | explore the design space for addressing congestion. But to do this,
6 | it is helpful to first take a step back and consider the bigger
7 | picture. The Internet is a complex arrangement of compute, storage,
8 | and communication resources that is shared among millions of
9 | users. The challenge is how to assign those resources—specifically
10 | switching capacity, buffer space, and link bandwidth—to end-to-end
11 | packet flows.
12 | 
13 | Because the Internet originally adopted a best-effort service model,
14 | and users (or more precisely, TCP running on their behalf) were free
15 | to send as many packets into the network as they could generate, it
16 | was not surprising that the Internet eventually suffered from the
17 | *tragedy of the commons*. And with users starting to experience congestion
18 | collapse, the natural response was to try to control it. Hence the
19 | term *congestion control*, which can be viewed as an implicit
20 | mechanism for allocating resources. It is implicit in the sense that
21 | as the control mechanism detects resources
22 | becoming scarce, it reacts in an effort to alleviate congestion.
23 | 
24 | A network service model in which resources are *explicitly* allocated
25 | to packet flows is the obvious alternative; for example, an
26 | application could make an explicit request for resources before
27 | sending traffic.
The best-effort assumption of IP meant such an 28 | approach was not immediately viable at the time congestion became a 29 | serious issue. Subsequent work was done to retrofit more explicit 30 | resource allocation mechanisms to the Internet's best-effort delivery 31 | model, including the ability to make *Quality-of-Service (QoS)* 32 | guarantees. It is instructive to consider the Internet's approach to 33 | congestion in the context of such efforts. The first section does so 34 | as it explores the set of design decisions that underlie the control 35 | mechanisms outlined in this book. We then define the criteria by 36 | which different congestion-control mechanisms can be quantitatively 37 | evaluated and compared. 38 | 39 | 3.1 Implementation Choices 40 | ------------------------------- 41 | 42 | We start by introducing four implementation choices that a congestion 43 | control mechanism faces, and the design rationale behind the decisions 44 | that were made for TCP/IP. Some of the decisions were "obvious" given 45 | the circumstances under which they were made, but for completeness—and 46 | because the Internet's continual evolution means circumstances 47 | change—it is prudent to consider them all. 48 | 49 | 3.1.1 Centralized versus Distributed 50 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | 52 | In principle, the first design decision is whether a network's 53 | approach to resource allocation is centralized or distributed. In 54 | practice, the Internet's scale—along with the autonomy of the 55 | organizations that connect to it—dictated a distributed 56 | approach. Indeed, distributed management of resources was an 57 | explicitly stated goal of the Internet's design, as articulated by 58 | Dave Clark. But acknowledging this default decision is important for 59 | two reasons. 60 | 61 | .. _reading_design: 62 | .. admonition:: Further Reading 63 | 64 | D. Clark, `The Design Philosophy of the DARPA Internet 65 | Protocols `__. 66 | ACM SIGCOMM, 1988. 67 | 68 | First, while the Internet's approach to congestion control is 69 | distributed across its millions of hosts and routers, it is fair to 70 | think of them as cooperatively trying to achieve a globally optimal 71 | solution. From this perspective, there is a shared objective 72 | function, and all the elements are implementing a distributed 73 | algorithm to optimize that function. The various mechanisms described 74 | throughout this book are simply defining different objective 75 | functions, where a persistent challenge has been how to think about 76 | competing objective functions when multiple mechanisms have been 77 | deployed. 78 | 79 | Second, while a centralized approach is not practical for the Internet 80 | as a whole, it can be appropriate for limited domains. For example, a 81 | logically centralized controller could collect information about the 82 | state of the network's links and switches, compute a globally optimal 83 | allocation, and then advise (or even police) end hosts as to how much 84 | capacity is available to each of them. Such an approach would certainly 85 | be limited by the time-scale in which the centralized controller could 86 | be responsive to changes in the network, but it has been successfully 87 | applied to the coarse-grained allocation decisions made by traffic 88 | engineering mechanisms like B4 and SWAN. 
Exactly where one draws a
89 | line between coarse-grain traffic engineering decisions and fine-grain
90 | congestion control decisions is not clear, but it's good to keep an
91 | open mind about the spectrum of options that are available.
92 |
93 | .. _reading_b4:
94 | .. admonition:: Further Reading
95 |
96 |    S. Jain, *et al*. `B4: Experience with a
97 |    Globally-Deployed Software Defined WAN
98 |    `__.
99 |    ACM SIGCOMM, August 2013.
100 |
101 | Centralized control has also been used effectively in datacenters,
102 | which are an interesting environment for congestion control. First,
103 | they have very low RTTs (for traffic between servers in the
104 | datacenter, if not for flows heading in or out of the datacenter).
105 | Second, in many cases a datacenter can be treated as a greenfield,
106 | raising the possibility of trying new approaches that don't have to
107 | coexist fairly with incumbent algorithms. Fastpass, developed in a
108 | collaboration between MIT and Facebook researchers, is a good example
109 | of such a centralized approach.
110 |
111 | .. _reading_fastpass:
112 | .. admonition:: Further Reading
113 |
114 |    J. Perry, *et al*. `Fastpass: A Centralized "Zero-Queue" Datacenter Network
115 |    `__.
116 |    ACM SIGCOMM, August 2014.
117 |
118 |
119 | 3.1.2 Router-Centric versus Host-Centric
120 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
121 |
122 | Given a distributed approach to resource allocation, the next question
123 | is whether to implement the mechanism inside the network (i.e., at
124 | the routers or switches) or at the edges of the network (i.e., in the
125 | hosts, perhaps as part of the transport protocol). This is not
126 | strictly an either/or situation. Both locations are involved, and the
127 | real issue is where the majority of the burden falls. Individual
128 | routers always take responsibility for deciding which packets to
129 | forward and which packets to drop. However, there is a range of options
130 | for how much the router involves the end hosts, whether in specifying
131 | how this decision is made or in learning how it was made.
132 |
133 | At one end of the spectrum, routers can allow hosts to reserve
134 | capacity and then ensure each flow's packets are delivered
135 | accordingly. They might do this, for example, by implementing a
136 | signalling protocol along with Fair
137 | Queuing, accepting new flows only when there is sufficient capacity,
138 | and policing hosts to make sure their flows stay within their
139 | reservations. This would correspond to a reservation-based approach in
140 | which the network is able to make QoS guarantees. We consider this
141 | out of scope for the purposes of this book.
142 |
143 | At the other end of the spectrum is a host-centric approach. The
144 | router makes no guarantees and offers no explicit feedback about the
145 | available capacity (i.e., silently drops packets when its buffers are
146 | full) and it is the host's responsibility to observe the network
147 | conditions (e.g., how many packets it is successfully getting
148 | through the network) and adjust its behavior accordingly.
149 |
150 | In the middle, routers can take more proactive action to assist the
151 | end hosts in doing their job, but not by reserving buffer space. This
152 | involves the router sending *feedback* to the end hosts when its
153 | buffers are full.
We describe some of these forms of *Active Queue
154 | Management (AQM)* in Chapter 6, but the host-centric mechanisms
155 | described in the next two chapters assume routers silently tail-drop
156 | packets when their buffers are full.
157 |
158 | Historically, the host-centric approach has been implemented in the
159 | transport layer—usually by TCP, or by some other transport protocol
160 | that mimics TCP's algorithm, such as DCCP (Datagram Congestion Control
161 | Protocol) or QUIC (a relatively recent transport protocol designed for
162 | HTTP-based applications). However, it is also possible to implement
163 | congestion control in the application itself. *DASH (Dynamic Adaptive
164 | Streaming over HTTP)* is an example, although it is best viewed as a
165 | combination of congestion control in the transport layer (since it
166 | runs over TCP) and the application layer. Based on measured network
167 | performance, the server that is streaming video to a client switches
168 | among a range of different video encodings, thus changing the rate at
169 | which data is sent into the HTTP stream. In effect, TCP tries to find
170 | a sustainable bandwidth for the flow, and then the application adapts
171 | its sending rate to fully leverage that rate without sending more data
172 | than can be sustained under the current network conditions. Primary
173 | responsibility for congestion control falls to TCP, but the
174 | application aims to keep the pipe full while also maintaining a good
175 | user experience.
176 |
177 |
178 | 3.1.3 Window-Based versus Rate-Based
179 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
180 |
181 | Having settled on a host-centric approach, the next implementation
182 | choice is whether the mechanism is *window-based* or *rate-based*.
183 | TCP uses a window-based mechanism to implement flow control, so the
184 | design decision for TCP congestion control seems obvious. And in
185 | fact, the congestion-control mechanisms described in Chapter 4 are
186 | centered around an algorithm for computing a *congestion window*,
187 | where the sender is throttled by whichever is smaller: the advertised
188 | flow-control window or the computed congestion-control window.
189 |
190 | But it is also possible to compute the rate at which the network is able
191 | to deliver packets, and to pace transmissions accordingly. The
192 | observed rate is just the number of bytes delivered over some time
193 | period, such as the measured RTT. We point out this duality between
194 | rates and windows because a rate-based approach is more appropriate
195 | for multimedia applications that generate data at some average rate
196 | and which need at least some minimum throughput to be useful. For
197 | example, a video codec might generate video at an average rate of
198 | 1 Mbps with a peak rate of 2 Mbps.
199 |
200 | A rate-based approach is the logical choice in a reservation-based
201 | system that supports different QoS levels, but even in a best-effort
202 | network like the Internet, it is possible to implement an adaptive
203 | rate-based congestion-control mechanism that informs the application
204 | when it needs to adjust its transmission rate, for example by adjusting
205 | its codec. This is the core idea of TCP-friendly rate control (TFRC),
206 | which extends the concepts of TCP congestion avoidance to applications
207 | that more naturally send packets at a specific rate (e.g., the bitrate
208 | produced by a video codec at a given quality level). TFRC is typically
209 | used in conjunction with RTP, a transport protocol designed for real-time
210 | applications. We will see examples of such mechanisms in Chapter 7.
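The duality between windows and rates can be seen in a few lines of
code. The following sketch is our own illustration, with made-up
numbers: it computes the window that throttles a TCP sender as the
lesser of the advertised flow-control window and the computed
congestion window, and then converts that window into the equivalent
sending rate by dividing by the RTT.

.. code-block:: c

   #include <stdio.h>

   /* Sender-side state; the values below are hypothetical. */
   struct tcp_state {
       double cwnd;   /* congestion window (bytes)              */
       double rwnd;   /* advertised flow-control window (bytes) */
       double rtt;    /* measured round-trip time (seconds)     */
   };

   /* Window-based view: the sender may have no more than
    * min(cwnd, rwnd) bytes in flight at any time. */
   double effective_window(const struct tcp_state *s)
   {
       return s->cwnd < s->rwnd ? s->cwnd : s->rwnd;
   }

   /* Rate-based view of the same limit: one window's worth of
    * bytes delivered per RTT corresponds to a sending rate. */
   double equivalent_rate(const struct tcp_state *s)
   {
       return effective_window(s) / s->rtt;   /* bytes per second */
   }

   int main(void)
   {
       struct tcp_state s = { 120000.0, 65535.0, 0.010 };
       printf("window %.0f bytes -> rate %.1f Mbps\n",
              effective_window(&s),
              equivalent_rate(&s) * 8 / 1e6);
       return 0;
   }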
211 |
212 | Finally, one of the recent advances in TCP congestion control is BBR
213 | (Bottleneck Bandwidth and RTT), which uses a combination of
214 | window-based and rate-based control, in an effort to limit the
215 | build-up of queues within the network. We examine this approach in some
216 | detail in Chapter 5.
217 |
218 |
219 |
220 |
221 | 3.1.4 Control-based versus Avoidance-based
222 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
223 |
224 | The final implementation choice we draw attention to is somewhat
225 | subtle. The challenge is for the end-host, based on feedback and
226 | observations, to compute how much capacity is available in the
227 | network, and adjust its sending rate accordingly. There are two
228 | general strategies for doing this: an aggressive approach that
229 | purposely sends packets at a rate that causes packet loss and then
230 | responds to it, and a conservative approach that tries to detect the
231 | onset of queue build-up and slow down before queues actually overflow.
232 | We refer to the mechanisms of the first type as *control-based*, and
233 | we refer to mechanisms of the second type as *avoidance-based*.
234 |
235 | .. _reading_avoidance:
236 | .. admonition:: Further Reading
237 |
238 |    R. Jain and K. K. Ramakrishnan. `Congestion Avoidance in
239 |    Computer Networks with a Connectionless Network Layer:
240 |    Concepts, Goals and Methodology `__.
241 |    Computer Networking Symposium, April 1988.
242 |
243 | This distinction was first called out by Raj Jain and
244 | K.K. Ramakrishnan in 1988. It is often overlooked—and the term
245 | "congestion control" is used generically to refer to both—but our take
246 | is that the distinction is important, and so we
247 | will call it out when appropriate. Admittedly, we will also fall back
248 | to the generic use of "congestion control" when the distinction is not
249 | critical to the discussion.
250 |
251 | Also note that the approaches we call "control-based" and
252 | "avoidance-based" are sometimes referred to as *loss-based* and
253 | *delay-based*, respectively, according to the criteria each uses as a
254 | signal that the congestion window needs to be adjusted. The former
255 | adjusts the window when it detects a loss and the latter adjusts the
256 | window when it detects a change in the delay gradient. When viewed
257 | from this perspective, each of the algorithms introduced over the next
258 | four chapters effectively refines the fidelity of these signals in one
259 | way or another.
260 |
261 |
262 | 3.2 Evaluation Criteria
263 | -----------------------
264 |
265 | Having identified the set of design decisions that go into crafting a
266 | congestion-control mechanism, the next question is whether any given
267 | solution is good or not. Recall that in Chapter 1 we posed the
268 | question of how a network *effectively* and *fairly* allocates its
269 | resources. This suggests at least two broad measures by which a
270 | resource allocation scheme can be evaluated. We consider each in turn.
271 |
272 | 3.2.1 Effectiveness
273 | ~~~~~~~~~~~~~~~~~~~
274 |
275 | A good starting point for evaluating the effectiveness of a
276 | congestion-control mechanism is to consider the two principal metrics
277 | of networking: throughput and delay.
Clearly, we want as much
278 | throughput and as little delay as possible. Unfortunately, these goals
279 | can be at odds with each other. One way to increase throughput is to
280 | allow as many packets into the network as possible, so as to drive the
281 | utilization of all the links up to 100%. We would do this to avoid the
282 | possibility of a link becoming idle because an idle link hurts
283 | throughput. The problem with this strategy is that increasing the
284 | number of packets in the network also increases the length of the
285 | queues at each router. Such *persistent queues* mean packets are
286 | delayed in the network, or worse, dropped. Having to drop packets in
287 | the middle of the network not only impacts delay but also hurts
288 | throughput because upstream link bandwidth has been wasted on a packet
289 | that was not successfully delivered all the way to the destination.\ [#]_
290 |
291 | .. [#]
292 |    We sometimes use the term *goodput* instead of *throughput* to
293 |    emphasize that we care about data that is successfully delivered
294 |    through the network to the receiver, as opposed to just transmitted
295 |    by the sender.
296 |
297 | The ratio of throughput to delay is a general metric for evaluating
298 | the effectiveness of a resource allocation scheme. This ratio is
299 | sometimes referred to as the *power* of the system:
300 |
301 | .. math::
302 |
303 |    \mathsf{Power = Throughput / Delay}
304 |
305 | Intuitively, the objective is to maximize this ratio, which is a
306 | function of how much load you place on the system. The load, in turn,
307 | is set by the resource allocation mechanism. :numref:`Figure %s
308 | <fig-power>` gives a representative power curve, where, ideally, the
309 | resource allocation mechanism would operate at the peak of this
310 | curve. To the left of the peak, the mechanism is being too
311 | conservative; that is, it is not allowing enough packets to be sent to
312 | keep the links busy. To the right of the peak, so many packets are
313 | being allowed into the network that either (a) increases in delay
314 | (denominator) due to queuing are starting to dominate any small gains
315 | in throughput, or (b) throughput (numerator) actually starts to drop
316 | due to packets being dropped.
317 |
318 | .. _fig-power:
319 | .. figure:: figures/f06-03-9780123850591.png
320 |    :width: 350px
321 |    :align: center
322 |
323 |    Ratio of throughput to delay as a function of load.
324 |
325 | Moreover, we need to be concerned about what happens even when the
326 | system is operating under heavy load—towards the right end of the
327 | curve in :numref:`Figure %s <fig-power>`. Ideally, we would like to
328 | avoid the situation in which the system throughput approaches
329 | zero. The goal is for the mechanism to be *stable*\ —where packets
330 | continue to get through the network even when it is operating under
331 | heavy load. If a mechanism is not stable under heavy load, the
332 | network will suffer from *congestion collapse*.
333 |
334 | Note that while both "persistent queues" and "congestion collapse" are
335 | to be avoided, there is no precise definition for the threshold at
336 | which a network suffers from either. They are both subjective
337 | judgments about an algorithm's behavior, where at the end of the day,
338 | latency and throughput are the two performance indicators that matter.
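To make the power metric concrete, the following sketch computes it
over a series of load levels and reports where it peaks. The
measurements here are invented for illustration (real values would
come from experiments like those described in Section 3.4), with
delay growing sharply and throughput eventually collapsing as load
increases, mirroring the shape of the curve above.

.. code-block:: c

   #include <stdio.h>

   /* Hypothetical measurements taken at increasing offered loads:
    * throughput in Gbps, average delay in ms. Not real data. */
   static double throughput[] = {1.0, 3.0, 6.0, 8.0, 9.0, 9.2, 8.0, 4.0};
   static double delay[]      = {1.0, 1.1, 1.3, 2.0, 4.0, 9.0, 25.0, 80.0};
   #define NPOINTS ((int)(sizeof(throughput) / sizeof(throughput[0])))

   int main(void)
   {
       int peak = 0;
       for (int i = 0; i < NPOINTS; i++) {
           double power = throughput[i] / delay[i];
           printf("load level %d: power = %.3f\n", i, power);
           if (power > throughput[peak] / delay[peak])
               peak = i;
       }
       printf("power peaks at load level %d\n", peak);
       return 0;
   }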
339 |
340 | 3.2.2 Fairness
341 | ~~~~~~~~~~~~~~~~~~~
342 |
343 | The effective utilization of network resources is not the only criterion
344 | for judging a resource allocation scheme. We must also consider the
345 | issue of fairness. However, we quickly get into murky waters when we try
346 | to define what exactly constitutes fair resource allocation. For
347 | example, a reservation-based resource allocation scheme provides an
348 | explicit way to create controlled unfairness. With such a scheme, we
349 | might use reservations to enable a video stream to receive 1 Mbps across
350 | some link while a file transfer receives only 10 kbps over the same
351 | link.
352 |
353 | In the absence of explicit information to the contrary, when several
354 | flows share a particular link, we would like for each flow to receive
355 | an equal share of the bandwidth. This definition presumes that a
356 | *fair* share of bandwidth means an *equal* share of bandwidth. But,
357 | even in the absence of reservations, equal shares may not equate to
358 | fair shares. Should we also consider the length of the paths being
359 | compared? For example, as illustrated in :numref:`Figure %s
360 | <fig-path-len>`, what is fair when one four-hop flow is competing with
361 | three one-hop flows?
362 |
363 | .. _fig-path-len:
364 | .. figure:: figures/Slide10.png
365 |    :width: 550px
366 |    :align: center
367 |
368 |    One four-hop flow competing with three one-hop flows.
369 |
370 | Assuming that the most fair situation would be one in which all flows
371 | receive the same bandwidth,
372 | networking researcher Raj Jain proposed a metric that can be used to
373 | quantify the fairness of a congestion-control mechanism. Jain’s fairness
374 | index is defined as follows. Given a set of flow throughputs
375 |
376 | .. math::
377 |
378 |    (x_{1}, x_{2}, \ldots , x_{n})
379 |
380 | (measured in consistent units such as bits/second), the following
381 | function assigns a fairness index to the flows:
382 |
383 | .. math::
384 |
385 |    f(x_{1}, x_{2}, \ldots ,x_{n}) = \frac{( \sum_{i=1}^{n} x_{i}
386 |    )^{2}} {n \sum_{i=1}^{n} x_{i}^{2}}
387 |
388 | The fairness index always results in a number between 0 and 1, with 1
389 | representing the greatest fairness. To understand the intuition behind this
390 | metric, consider the case where all *n* flows receive a throughput of
391 | 1 unit of data per second. We can see that the fairness index in this
392 | case is
393 |
394 | .. math::
395 |
396 |    \frac{n^2}{n \times n} = 1
397 |
398 | Now suppose one flow receives a throughput of :math:`1 + \Delta`
399 | while the others continue to receive 1. The fairness index is now
400 |
401 | .. math::
402 |
403 |    \frac{((n - 1) + 1 + \Delta)^2}{n(n - 1 + (1 + \Delta)^2)}
404 |    = \frac{n^2 + 2n\Delta + \Delta^2}{n^2 + 2n\Delta + n\Delta^2}
405 |
406 | Note that the denominator exceeds the numerator by :math:`(n-1)\Delta^2`.
407 | Thus, whether the odd flow out was getting more or less than all the
408 | other flows (positive or negative :math:`\Delta`), the fairness index has
409 | now dropped below one. Another simple case to
410 | consider is where only *k* of the *n* flows receive equal throughput,
411 | and the remaining *n-k* flows receive zero throughput, in which case the
412 | fairness index drops to \ *k/n*.
413 |
414 | .. _reading_jain:
415 | .. admonition:: Further Reading
416 |
417 |    R. Jain, D. Chiu, and W. Hawe. `A Quantitative Measure of Fairness
418 |    and Discrimination for Resource Allocation in Shared Computer Systems
419 |    `__.
420 |    DEC Research Report TR-301, 1984.
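The index is straightforward to compute. The following C sketch
implements the formula directly and checks the three cases just
discussed: equal shares, one flow receiving :math:`1 + \Delta`, and
only *k* of *n* flows receiving any throughput. The throughput values
are made up for the purposes of illustration.

.. code-block:: c

   #include <stdio.h>

   /* Jain's fairness index: (sum x_i)^2 / (n * sum x_i^2).
    * Returns a value in (0, 1], where 1 means all flows receive
    * an equal share. */
   double jain_index(const double x[], int n)
   {
       double sum = 0.0, sum_sq = 0.0;
       for (int i = 0; i < n; i++) {
           sum += x[i];
           sum_sq += x[i] * x[i];
       }
       return (sum * sum) / (n * sum_sq);
   }

   int main(void)
   {
       double equal[]   = {1.0, 1.0, 1.0, 1.0};  /* index = 1          */
       double skewed[]  = {1.0, 1.0, 1.0, 1.5};  /* one flow gets more */
       double starved[] = {1.0, 1.0, 0.0, 0.0};  /* k/n = 2/4 = 0.5    */

       printf("equal: %.3f  skewed: %.3f  starved: %.3f\n",
              jain_index(equal, 4), jain_index(skewed, 4),
              jain_index(starved, 4));
       return 0;
   }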
421 |
422 | In the next section we revisit the notion of fairness as it applies to
423 | the deployment of new congestion control algorithms. As noted above,
424 | it is not as clear-cut as it might first appear.
425 |
426 | TCP-friendly rate control (TFRC) also uses the notion of
427 | fairness. TFRC uses the TCP throughput equation (discussed in Section
428 | 1.3) to estimate the share of a
429 | congested link's bandwidth that
430 | would be obtained by a flow that implemented TCP's congestion control
431 | scheme, and sets that as a target rate for an application to
432 | send data. The application can then make decisions to help it hit that
433 | target rate. For example, a video streaming application might choose among a
434 | set of different encoding quality levels to try to maintain an
435 | average rate at the "fair" level as determined by TFRC.
436 |
437 | 3.3 Comparative Analysis
438 | ---------------------------
439 |
440 | The first step in evaluating any congestion control mechanism is to
441 | measure its performance in isolation, including:
442 |
443 | * The average throughput (goodput) flows are able to achieve.
444 |
445 | * The average end-to-end delay flows experience.
446 |
447 | * Whether the mechanism avoids persistent queues across a range of
448 |   operating scenarios.
449 |
450 | * Whether the mechanism remains stable across a range of operating scenarios.
451 |
452 | * The degree to which flows receive a fair share of the available
453 |   capacity.
454 |
455 | The inevitable second step is to compare two or more mechanisms. This
456 | is because, given the decentralized nature of the Internet, there is
457 | no way to ensure uniform adoption of just one mechanism.
458 | Comparing quantitative metrics like throughput is easy. The problem is
459 | how to evaluate multiple mechanisms that might coexist, competing with
460 | each other for network resources.
461 |
462 | The question is not whether a given mechanism treats all of its flows
463 | fairly, but whether mechanism A is fair to flows managed by
464 | mechanism B. If mechanism A achieves better throughput
465 | than B, but does so by being more aggressive, and hence stealing
466 | bandwidth from B's flows, then A's improvement is not fairly gained
467 | and may be discounted. It should be evident that the Internet's highly
468 | decentralized approach to congestion control
469 | works because a large number of flows respond in a cooperative way to
470 | congestion. This opens the door to more aggressive flows improving
471 | their performance at the expense of those that implement the
472 | accepted, less aggressive algorithms.
473 |
474 | .. _reading_ware:
475 | .. admonition:: Further Reading
476 |
477 |    R. Ware, *et al*. `Beyond Jain's Fairness Index: Setting the Bar for
478 |    the Deployment of Congestion Control Algorithms
479 |    `__.
480 |    ACM SIGCOMM HotNets. November 2019.
481 |
482 | Arguments like this have been made many times over the last 30 years,
483 | which has raised a high bar to the deployment of new algorithms. Even
484 | if global deployment of a new algorithm would be a net positive,
485 | incremental deployment (which is the only real option) could
486 | negatively impact flows using existing algorithms, leading to a
487 | reluctance to deploy new approaches.
But such
488 | analysis suffers from three problems, as identified by Ranysha Ware and
489 | colleagues:
490 |
491 | * **Ideal-Driven Goalposting:** A fairness-based threshold asserts that
492 |   new mechanism B should equally share the bottleneck link with
493 |   currently deployed mechanism A. This goal is too idealistic in
494 |   practice, especially when A is sometimes unfair to its own flows.
495 |
496 | * **Throughput-Centricity:** A fairness-based threshold focuses on
497 |   how new mechanism B impacts a competitor flow using mechanism A
498 |   by focusing on A’s achieved throughput. However, this ignores other
499 |   important figures of merit for good performance, such as latency,
500 |   flow completion time, or loss rate.
501 |
502 | * **Assumption of Balance:** Inter-mechanism interactions often have
503 |   some bias, but a fairness metric cannot tell whether the outcome
504 |   is biased for or against the status quo. It makes a difference in
505 |   terms of deployability whether a new mechanism B takes a larger
506 |   share of bandwidth than legacy mechanism A or leaves a larger
507 |   share for A to consume: the former might elicit complaints from
508 |   legacy users of A, where the latter would not. Jain’s Fairness
509 |   Index assigns an equivalent score to both scenarios.
510 |
511 | Instead of a simple calculation of Jain's fairness index, Ware
512 | advocates for a threshold based on *harm*, as measured by a reduction
513 | in throughput or an increase in latency or jitter. Intuitively, if the
514 | harm that flows using a new mechanism B inflict on flows using existing
515 | mechanism A is within a bound derived from how much harm A-managed
516 | flows cause other A-managed flows, we can consider B deployable
517 | alongside A without harm. Ware goes on to propose concrete measures of
518 | acceptable harm, a task that turns out to be more complicated than it
519 | might first appear. Even with a single congestion control algorithm,
520 | the amount of harm that one flow causes another depends on factors
521 | such as its RTT, start time, and duration. Thus, measures of harm need
522 | to take into account the range of impacts that different flows have on
523 | each other under the existing regime and aim to do no worse with a
524 | new algorithm.
525 |
526 | 3.4 Experimental Methodology
527 | --------------------------------
528 |
529 | Our approach to evaluating congestion-control mechanisms is to measure
530 | their performance on real systems, and as we pointed out in Chapter 1,
531 | the *de facto* specification of the respective mechanisms is the version
532 | implemented in Linux. We now describe one specific way to perform
533 | those measurements, illustrating one methodology that is widely
534 | practiced today. Our approach uses *Netesto (Network Test Toolkit)*, a
535 | collection of software tools available on GitHub. The alternative is a
536 | simulation-based approach, with NS-3 being the most popular open-source tool.
537 |
538 | .. _reading_ns3:
539 | .. admonition:: Further Reading
540 |
541 |    `Netesto `__
542 |
543 |    `NS-3 Network Simulator `__
544 |
545 | Note that while the experiments described in this section measure real
546 | congestion control algorithms (which, of course, we have not yet
547 | described in any detail), the intent is to outline how algorithms are
548 | evaluated, and not to actually draw any conclusions about specific
549 | mechanisms.
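Before turning to the experimental setup, it is worth making the
harm-based threshold from Section 3.3 concrete. The sketch below is a
deliberate simplification of our own making, not the precise measure
Ware proposes: it treats harm as the fractional reduction in a flow's
goodput relative to a baseline run, and deems a new mechanism B
deployable if the harm B inflicts on A's flows stays within the harm
that A's flows already inflict on each other. The goodput numbers are
hypothetical.

.. code-block:: c

   #include <stdbool.h>
   #include <stdio.h>

   /* Fractional harm: how much a competitor reduces a flow's goodput
    * relative to what it achieved in a baseline run. A simplified
    * stand-in for Ware's proposed measures. */
   double harm(double baseline, double with_competitor)
   {
       return (baseline - with_competitor) / baseline;
   }

   int main(void)
   {
       /* Hypothetical goodputs, in Mbps, for one A-managed flow. */
       double solo = 10.0;   /* running alone                   */
       double vs_a = 5.2;    /* competing with another A flow   */
       double vs_b = 4.6;    /* competing with a B flow instead */

       double bound = harm(solo, vs_a);   /* harm A already does to A */
       bool ok = harm(solo, vs_b) <= bound;

       printf("A-on-A harm %.2f, B-on-A harm %.2f: B %s the bound\n",
              bound, harm(solo, vs_b), ok ? "meets" : "exceeds");
       return 0;
   }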
550 |
551 | 3.4.1 Experimental Setup
552 | ~~~~~~~~~~~~~~~~~~~~~~~~
553 |
554 | Our approach uses real TCP senders/receivers running on Linux hosts,
555 | with a range of behaviors studied using a combination of kernel
556 | facilities such as the ``netem`` and ``tbf`` queuing disciplines
557 | (qdiscs). Performance data is then collected using ``tcpdump``. The
558 | network connecting the end-hosts is constructed from a combination of
559 | real switches and emulated elements, supporting, for example,
560 | wide-area delays and low-bandwidth links.
561 |
562 | The experiments can be characterized along two orthogonal
563 | dimensions. One is the topology of the network. This includes link
564 | bandwidths, RTTs, buffer sizes, and so on. The other dimension is the
565 | traffic workload we run on the network. This includes the number and
566 | duration of flows, as well as the characteristics of each flow (e.g.,
567 | stream vs. RPC).
568 |
569 | With respect to network topology, we evaluate algorithms on three
570 | specific configurations:
571 |
572 | * LAN with :math:`20\mu\rm{s}` RTT and 10-Gbps link bandwidth. This scenario
573 |   represents servers in the same datacenter rack.
574 |
575 | * WAN with 10ms RTT and 10-Gbps link bandwidth, with delay introduced
576 |   on the receiver by configuring a 20,000-packet send queue. The
577 |   bottleneck is a real switch with shallow buffers (1-2 MB). This is a
578 |   good scenario for visualizing an algorithm’s dynamics when looking at
579 |   two to three flows.
580 |
581 | * WAN with 40ms RTT and 10/100-Mbps bottleneck bandwidth, with an
582 |   intermediate router introduced to reduce the link bandwidth to 10 or
583 |   100 Mbps. This scenario reflects a connection an end-user might
584 |   experience on a modern network.
585 |
586 | :numref:`Figure %s <fig-10gig>` shows the topology for the first two
587 | scenarios, where the senders and receivers are connected through a
588 | single switch. Delay is achieved for the second scenario using
589 | ``netem`` on the receiver, which affects only the ACKs being sent
590 | back.
591 |
592 | .. _fig-10gig:
593 | .. figure:: figures/Slide2.png
594 |    :width: 350px
595 |    :align: center
596 |
597 |    Topology for 10-Gbps Tests, optionally with 10ms of delay introduced.
598 |
599 | :numref:`Figure %s <fig-100meg>` shows the topology for the third
600 | scenario, where the router is implemented by a server-based forwarder
601 | that throttles outgoing link bandwidth using the ``tbf`` qdisc.
602 |
603 | .. _fig-100meg:
604 | .. figure:: figures/Slide3.png
605 |    :width: 550px
606 |    :align: center
607 |
608 |    Topology for 10-Mbps and 100-Mbps Tests with 10ms or 40ms of delay
609 |    introduced.
610 |
611 | With respect to traffic workload, we evaluate the dynamics and
612 | fairness of algorithms with the following tests:
613 |
614 | * 2-flow Test: The first flow lasts 60 seconds, and the second flow lasts
615 |   20 seconds and starts 22 seconds after the first one.
616 |
617 | * 3-flow Test: The first flow lasts 60 seconds, the second flow lasts 40
618 |   seconds and starts 12 seconds after the first one, and the third flow
619 |   lasts 20 seconds and starts 26 seconds after the first one.
620 |
621 | These tests make it possible to:
622 |
623 | * Examine how quickly existing flows adapt to new flows.
624 |
625 | * Examine how quickly flows adapt to released bandwidth from terminating flows.
626 |
627 | * Measure fairness between flows with the same (or different) congestion algorithm(s).
628 | * Measure levels of congestion.
629 |
630 | * Identify conditions under which performance changes abruptly,
631 |   signalling a possible instability.
632 |
633 | Additional tests include a combination of streaming flows plus 10-KB and
634 | 1-MB RPCs. These tests allow us to see if the smaller RPC flows are
635 | penalized, and if so, by how much. In particular, they make it possible to:
636 |
637 | * Study behavior under increasing loads.
638 |
639 | * Measure the performance (throughput and latency) of 1-MB and 10-KB
640 |   flows, as well as how fairly the available bandwidth is divided
641 |   between them.
642 |
643 | * Identify conditions under which retransmissions or latency change
644 |   abruptly, signalling an instability.
645 |
646 |
647 | 3.4.2 Example Results
648 | ~~~~~~~~~~~~~~~~~~~~~
649 |
650 | The following shows some example results, selected to illustrate the
651 | evaluation process. We start with a simple 2-flow experiment, where
652 | both flows are managed by the same congestion-control algorithm.
653 | :numref:`Figure %s <fig-graph_1a>` shows the resulting goodput
654 | graph. As one would hope, once the second flow (in red) starts just
655 | after 20 seconds, the goodputs of the two flows converge towards a nearly
656 | equal share of the available bandwidth. This convergence is not
657 | immediate (the two plots cross over roughly ten seconds after the
658 | second flow begins), a behavior other algorithms try to correct (e.g.,
659 | by using explicit feedback from routers). On the plus side, the first
660 | flow does quickly adapt to the released bandwidth once the second flow
661 | terminates.
662 |
663 | .. _fig-graph_1a:
664 | .. figure:: figures/Graph_1A.png
665 |    :width: 500px
666 |    :align: center
667 |
668 |    Goodput (bytes per second delivered end-to-end) realized by two
669 |    flows running under the same congestion-control algorithm.
670 |
671 | It is also possible to look more closely at these two flows, for
672 | example, by tracking the congestion window for each. The corresponding
673 | plot is shown in :numref:`Figure %s <fig-graph_1b>`. Not surprisingly,
674 | different algorithms produce different congestion window "patterns"
675 | over time, as we will see in the next chapter.
676 |
677 | .. _fig-graph_1b:
678 | .. figure:: figures/Graph_1B.png
679 |    :width: 500px
680 |    :align: center
681 |
682 |    Congestion window (measured in bytes) for two flows competing for
683 |    bandwidth under the same congestion-control algorithm.
684 |
685 | We could repeat these experiments but vary the algorithm used by one
686 | of the flows. This would allow us to visualize how the two algorithms
687 | interact. If they are both fair, you would expect to see results
688 | similar to :numref:`Figure %s <fig-graph_1a>`. If not, you might see a
689 | graph similar to :numref:`Figure %s <fig-graph_6c>`, in which the
690 | second flow (Algorithm B) aggressively takes bandwidth away from the
691 | first flow (Algorithm A).
692 |
693 | .. _fig-graph_6c:
694 | .. figure:: figures/Graph_6C.png
695 |    :width: 500px
696 |    :align: center
697 |
698 |    Goodput (bytes per second delivered end-to-end) realized by two
699 |    flows running under different congestion-control algorithms, with
700 |    one flow receiving significantly less bandwidth than the other.
701 |
702 | These experiments can be repeated with three concurrent flows, but we
703 | turn next to evaluating how various algorithms treat different
704 | workloads.
In particular, we are interested in the question of *size
705 | fairness*, that is, how a given algorithm treats back-to-back 10-KB or
706 | 1-MB RPC calls when they have to compete with ongoing stream-based
707 | flows. Some example results are shown in :numref:`Figure %s
708 | <fig-graph_8b>` (1-MB RPCs) and :numref:`Figure %s <fig-graph_8c>`
709 | (10-KB RPCs). The figures show the performance of five different
710 | algorithms (represented by different colors), across test runs with 1,
711 | 2, 4, 8, and 16 concurrent streaming flows.
712 |
713 | .. _fig-graph_8b:
714 | .. figure:: figures/Graph_8B.png
715 |    :width: 500px
716 |    :align: center
717 |
718 |    Average goodput (measured in Gbps) realized by a sequence of
719 |    1-MB RPC calls for five different algorithms, when competing with
720 |    a varied number of TCP streams.
721 |
722 | .. _fig-graph_8c:
723 | .. figure:: figures/Graph_8C.png
724 |    :width: 500px
725 |    :align: center
726 |
727 |    Average goodput (measured in Gbps) realized by a sequence of
728 |    10-KB RPC calls for five different algorithms, when competing with
729 |    a varied number of TCP streams.
730 |
731 | The 1-MB results are unsurprising, with no significant outliers across
732 | the five algorithms, and the average goodput decreasing as the RPCs
733 | compete with more and more streams. Although not shown in :numref:`Figure
734 | %s <fig-graph_8b>`, the fourth algorithm (green), which performs best
735 | when all flows are stream-based, suffers a significant number of
736 | retransmissions when sharing the available bandwidth among RPC calls.
737 |
738 | The 10-KB results do have a significant outlier, with the third
739 | algorithm (yellow) performing significantly better, by a factor of
740 | four. If you plot latency rather than bandwidth—the more relevant metric
741 | for small-message RPC calls—it turns out the third algorithm both
742 | achieves the lowest latencies and does so consistently, with the 99th
743 | and 99.9th percentiles being the same.
744 |
745 | Finally, all of the preceding experiments can be repeated on a
746 | network topology that includes wide-area RTTs. Certainly inter-flow
747 | fairness and size fairness continue to be concerns, but there is also
748 | an increased likelihood that queuing delays become an issue. For
749 | example, :numref:`Figure %s <fig-graph_16b>` shows the 99th-percentile
750 | latencies for four different algorithms when the network topology includes a
751 | 10-Mbps bottleneck link and a 40ms RTT. One important observation
752 | about this result is that the second algorithm (red) performs poorly
753 | when there is less than one bandwidth-delay product of buffering
754 | available at the bottleneck router, calling attention to another
755 | variable that can influence your results.
756 |
757 | .. _fig-graph_16b:
758 | .. figure:: figures/Graph_16B.png
759 |    :width: 500px
760 |    :align: center
761 |
762 |    99th percentile latencies for 10-KB RPC calls when competing with a
763 |    single streaming flow on a 40ms WAN, measured for different
764 |    amounts of buffering at the bottleneck router.
765 |
766 | We conclude this discussion of experimental methodology by permitting
767 | ourselves one summary evaluation statement. When looking across a set
768 | of algorithms and a range of topology/traffic scenarios, our
769 | conclusion is this: *No single algorithm is better than all other
770 | algorithms under all conditions.* One explanation, as these examples
771 | demonstrate, is the sheer number of factors that have to be taken into consideration.
This also 772 | explains why congestion control continues to be a topic of interest 773 | for both network researchers and network practitioners. 774 | -------------------------------------------------------------------------------- /dict.txt: -------------------------------------------------------------------------------- 1 | ACK 2 | Aether 3 | Athuraliya 4 | BBR 5 | BPF 6 | eBPF 7 | Bemmel 8 | Braden 9 | Brakmo 10 | Cardwell 11 | Centric 12 | Cheng 13 | Chiu 14 | Connor 15 | Cwnd 16 | ECN 17 | Fastpass 18 | Gbps 19 | Geng 20 | GigE 21 | Goodput 22 | Gunn 23 | Hawe 24 | IPv 25 | Janey 26 | KBps 27 | Kahn 28 | Karels 29 | Karn 30 | Kleinrock 31 | Kobayashi 32 | Leffler 33 | Mahdavi 34 | Makefile 35 | Mbps 36 | Multipath 37 | Multiprotocol 38 | Nagle 39 | Nico 40 | O'Malley 41 | Malley 42 | OC 43 | Ott 44 | Paxson 45 | Ph 46 | QoS 47 | Quarterman 48 | RTP 49 | RTT 50 | Raj 51 | Ramakrishnan 52 | Ranysha 53 | Realtime 54 | Retransmit 55 | Saltzer 56 | Sanjeewa 57 | Semke 58 | TCP 59 | TFRC 60 | TFRC 61 | Usenix 62 | Vibert 63 | VMware 64 | Westwood 65 | Yeganeh 66 | Davie 67 | Metcalf 68 | Alizadeh 69 | Shalunov 70 | Cascone 71 | Vachuska 72 | Wi 73 | Fi 74 | Iyengar 75 | Swett 76 | Handley 77 | Padhye 78 | Widmer 79 | Padhye 80 | Firoiu 81 | Towsley 82 | Kurose 83 | Haiqing 84 | Jiang 85 | Liu 86 | Xie 87 | Yi 88 | Jamieson 89 | Sunay 90 | Gettys 91 | Briscoe 92 | Xu 93 | Radhika 94 | Mittal 95 | Yihua 96 | Gao 97 | Wischik 98 | Raiciu 99 | Greenhalgh 100 | Micheloni 101 | vanBemmel 102 | Omer 103 | Shapira 104 | Giulio 105 | 106 | 107 | al 108 | asymptotes 109 | asymptoting 110 | backoff 111 | basestation 112 | basestations 113 | bitrate 114 | burstiness 115 | bufferbloat 116 | bursty 117 | centric 118 | codec 119 | connectionless 120 | cwnd 121 | datagram 122 | deployability 123 | deployable 124 | et 125 | goodput 126 | granularities 127 | halvings 128 | incrementing 129 | jitter 130 | kbps 131 | kilobits 132 | latencies 133 | misdelivered 134 | multipath 135 | nd 136 | piecewise 137 | pre 138 | queueing 139 | resends 140 | retransmission 141 | retransmissions 142 | retransmit 143 | retransmits 144 | retransmitted 145 | retransmitting 146 | rollout 147 | sawtooth 148 | subflow 149 | subflows 150 | teardown 151 | throughputs 152 | timeframe 153 | toolset 154 | topologies 155 | uncongested 156 | underutilizing 157 | unscaled 158 | utilizations 159 | virtualenv 160 | acked 161 | learnings 162 | intra 163 | misordering 164 | lossy 165 | todo 166 | th 167 | Vidhi 168 | Goel 169 | -------------------------------------------------------------------------------- /figures/Figure-sources.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Figure-sources.pptx -------------------------------------------------------------------------------- /figures/Graph_16B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_16B.png -------------------------------------------------------------------------------- /figures/Graph_1A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_1A.png -------------------------------------------------------------------------------- /figures/Graph_1B.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_1B.png -------------------------------------------------------------------------------- /figures/Graph_6C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_6C.png -------------------------------------------------------------------------------- /figures/Graph_8B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_8B.png -------------------------------------------------------------------------------- /figures/Graph_8C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_8C.png -------------------------------------------------------------------------------- /figures/Slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide1.png -------------------------------------------------------------------------------- /figures/Slide10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide10.png -------------------------------------------------------------------------------- /figures/Slide11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide11.png -------------------------------------------------------------------------------- /figures/Slide12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide12.png -------------------------------------------------------------------------------- /figures/Slide13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide13.png -------------------------------------------------------------------------------- /figures/Slide14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide14.png -------------------------------------------------------------------------------- /figures/Slide15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide15.png -------------------------------------------------------------------------------- /figures/Slide16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide16.png 
-------------------------------------------------------------------------------- /figures/Slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide2.png -------------------------------------------------------------------------------- /figures/Slide3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide3.png -------------------------------------------------------------------------------- /figures/Slide4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide4.png -------------------------------------------------------------------------------- /figures/Slide5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide5.png -------------------------------------------------------------------------------- /figures/Slide6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide6.png -------------------------------------------------------------------------------- /figures/Slide7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide7.png -------------------------------------------------------------------------------- /figures/Slide8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide8.png -------------------------------------------------------------------------------- /figures/Slide9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide9.png -------------------------------------------------------------------------------- /figures/f03-16-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f03-16-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-03-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-03-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-04-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-04-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-05-9780123850591.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-05-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-08-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-08-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-10-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-10-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-03-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-03-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-05-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-05-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-08-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-08-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-09-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-09-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-10-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-10-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-11-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-11-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-12-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-12-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-13-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-13-9780123850591.png 
--------------------------------------------------------------------------------
/figures/f06-14-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-14-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-15-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-15-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-16-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-16-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-17-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-17-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-18-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-18-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-19-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-19-9780123850591.png
--------------------------------------------------------------------------------
/foreword.rst:
--------------------------------------------------------------------------------
1 | Foreword
2 | ==========
3 |
4 | Congestion control is unquestionably one of the most important, most
5 | fundamental topics in computer networking. It’s also one of the most
6 | challenging, as it requires controlling endpoints that are potentially
7 | distributed around the globe, in different organizations, and
8 | supporting different applications. The role of the network layer in
9 | supporting transport-layer congestion control is also a multi-faceted,
10 | nuanced challenge. And congestion control is needed in just about
11 | every Internet scenario one can imagine: from the public Internet that
12 | spans the globe and carries all types of traffic, to long “fat” pipes
13 | carrying massive amounts of file-transfer data, to specialized
14 | datacenter networks, to private commercial backbone networks, to
15 | mobile and wireless networks.
16 |
17 | With all of these challenges, how does one make sense of the many
18 | (many!) approaches towards congestion control that have been
19 | developed? What are the fundamental challenges these approaches are
20 | solving? What is the role of the network layer, and more broadly what
21 | is the design space for congestion control protocols? Are there broad
22 | classes or approaches towards congestion control that can be
23 | identified? Which approaches have been adopted in practice, and why?
24 | And among those many “flavors”/variations of TCP that you might have
25 | heard about—how do they differ and in what scenarios are they best
26 | used, and why? So many questions!
27 |
28 | To make sense of this and to answer all of these questions (and more)
29 | would require not just a book, but a great book! And now fortunately,
30 | there is such a book—this book! The three authors of *TCP Congestion
31 | Control: A Systems Approach* are among the most knowledgeable
32 | congestion control researchers on the planet—Brakmo and Peterson’s
33 | TCP Vegas protocol (you can learn more about that in Section 5.1)
34 | pioneered the notion that endpoints could anticipate and avoid
35 | congestion, rather than react to observed congestion; TCP Vegas has
36 | served as a foundation on which more recent congestion avoidance
37 | protocols (such as the BBR protocol championed by Google, see Section
38 | 5.3) have been designed. The authors are also absolutely fabulous
39 | writers (and I say this as a textbook author myself)—lucid, clear, and
40 | engaging, and able to organize and communicate complex ideas, with
41 | just the right amount of detail and discussion of practice. The
42 | “systems approach” that Larry Peterson and Bruce Davie have championed
43 | is also exactly what is needed to truly understand congestion control,
44 | where deep, system-wide issues in network architecture come to the
45 | fore (e.g., the separation and interaction of network and transport
46 | layer functionalities; the question of implementing network services,
47 | such as congestion control, in either the application layer or in
48 | the network).
49 |
50 | This book is a needed and most welcome addition to the fabulous set of
51 | open source, “systems approach” books that Larry, Bruce and others
52 | have been developing. I hope you read it cover-to-cover, consult it
53 | again later as you need it in the future, and enjoy it as much as I
54 | have.
55 |
56 | | Jim Kurose
57 | | Amherst, Massachusetts
--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: _static/SystemsApproachLogoURL.png
2 |    :width: 300px
3 |    :align: center
4 |    :target: https://systemsapproach.org
5 |
6 | |
7 |
8 | TCP Congestion Control: A Systems Approach
9 | =============================================
10 |
11 | Peterson, Brakmo, and Davie
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 |
14 | |
15 |
16 | .. toctree::
17 |    :maxdepth: 2
18 |    :caption: Table of Contents
19 |
20 |    foreword.rst
21 |    preface.rst
22 |    intro.rst
23 |    tcp_ip.rst
24 |    design.rst
25 |    algorithm.rst
26 |    avoidance.rst
27 |    aqm.rst
28 |    variants.rst
29 |    biblio.rst
30 |    README.rst
31 |    authors.rst
32 |    latest.rst
33 |    print.rst
--------------------------------------------------------------------------------
/intro.rst:
--------------------------------------------------------------------------------
1 | Chapter 1: Introduction
2 | ========================
3 |
4 | The Internet is considered an engineering success with few peers, and
5 | rightfully so. It has scaled to connect billions of devices, supports
6 | every imagined communications application, and accommodates
7 | transmission rates ranging from tens of bits per day to hundreds of
8 | gigabits per second.
But at its core is a thorny technical challenge 9 | that has drawn widespread attention for the last 30-plus years, from 10 | both practitioners trying to make the Internet perform better and 11 | theoreticians wanting to understand its mathematical underpinnings: 12 | how the Internet’s resources are best allocated to all the competing 13 | interests trying to use it. 14 | 15 | Resource allocation is a hard problem in any computer system, but 16 | especially so for a system as complex as the Internet. The problem was 17 | not top-of-mind when the Internet’s TCP/IP protocol stack was first 18 | deployed in the early 1980s. By the end of the decade, however, with the 19 | Internet gaining serious use in universities (but predating 20 | the World Wide Web's invention by several years), the network began 21 | to experience a 22 | phenomenon known as *congestion collapse*. A solution—congestion 23 | control—was developed and deployed in the late 1980s and the 24 | immediate crisis was addressed. The 25 | Internet community has been studying and refining its approach to 26 | congestion control ever since. This book is about that journey. 27 | 28 | The most famous early efforts to manage congestion were undertaken by two 29 | researchers, Van Jacobson and Mike Karels. The resulting paper, 30 | *Congestion Avoidance and Control*, published in 1988, is one of the 31 | most cited papers in networking of all time. There are 32 | good reasons for that. One is that congestion collapse really did 33 | threaten the nascent Internet, and the work undertaken to 34 | address it was foundational to the Internet's ultimate 35 | success. Without that work it's unlikely we'd have the global Internet 36 | we have today. 37 | 38 | Another reason for the citation impact of this work is that congestion 39 | control has been an amazingly fruitful area of research for over three 40 | decades. Congestion control, and resource allocation more broadly, are 41 | wide open design spaces with plenty of room for innovation. Decades of 42 | research and implementation have built on the early foundations, and 43 | it seems fair to assume that new approaches 44 | or refinements to the existing approaches will continue to appear for 45 | as long as the Internet exists. 46 | 47 | In this book, we explore the design space for congestion control in 48 | the Internet and present a description of the major approaches to 49 | managing or avoiding congestion that 50 | have been developed over the last three decades. 51 | 52 | 53 | .. _reading_vj: 54 | .. admonition:: Further Reading 55 | 56 | V. Jacobson. `Congestion Avoidance and Control 57 | `__. 58 | ACM SIGCOMM '88 Symposium, August 1988. 59 | 60 | 61 | 1.1 What is Congestion? 62 | ------------------------ 63 | 64 | Anyone who has driven on a highway at rush hour has experienced 65 | congestion. There is a limited resource—the space on the highway—and a 66 | set of cars, trucks, etc. that compete for that resource. As rush hour 67 | gets underway, more traffic arrives but the road keeps working as 68 | intended, just with more vehicles on it. But there 69 | comes a point where the number of vehicles becomes so large that 70 | everyone has to slow down (because there is no longer enough space for 71 | everyone to keep a safe distance at the speed limit) at which point the 72 | road actually becomes *less effective* at moving vehicles. 
So, just at 73 | the point when you would be wanting more capacity, there is actually 74 | less capacity to move traffic, as illustrated in :numref:`Figure %s `. This is the essence of *congestion 75 | collapse*, when congestion is so bad that the system starts to perform 76 | significantly worse than it did without congestion. The mechanism of congestion collapse is quite a bit different for 77 | packet networks than for highways, but it is equally problematic [#]_. 78 | 79 | 80 | .. _fig-collapse: 81 | .. figure:: figures/Slide1.png 82 | :width: 400px 83 | :align: center 84 | 85 | As load increases, throughput rises then falls at the point of 86 | congestion collapse. 87 | 88 | 89 | .. [#] Networking people like making analogies between real-world 90 | congestion and network congestion, but it's important to 91 | recognize that analogies are imperfect. 92 | 93 | This book focuses on congestion control for packet-switched 94 | networks. A fundamental aspect of packet switching is *multiplexing*, 95 | which is the means by which a system resource—such as a link or a 96 | queue in a router—is shared among multiple users or applications. In 97 | the case of the Internet, packet networks are *statistically 98 | multiplexed*, which means that, as packets show up somewhat randomly, 99 | we rely on the statistical properties of those arrivals to ensure that 100 | we don't run out of resources. The existence of congestion collapse 101 | shows that sometimes the statistics don't quite work out as we'd like. 102 | 103 | To see how this might work, consider the simple network illustrated in 104 | :numref:`Figure %s `, where the three hosts on the left side 105 | of the network (senders S1-S3) are sending data to the three hosts on 106 | the right (receivers R1-R3) by sharing a switched network that 107 | contains only one physical link. (For simplicity, assume that host S1 108 | is sending data to host R1, and so on.) In this situation, three flows 109 | of data—corresponding to the three pairs of hosts—are multiplexed onto 110 | a single physical link by switch 1 and then *demultiplexed* back into 111 | separate flows by switch 2. Note that we are being intentionally vague 112 | about exactly what a “flow of data” corresponds to for now, but we 113 | will make this more precise in later chapters. 114 | 115 | .. _fig-mux: 116 | .. figure:: figures/Slide11.png 117 | :width: 400px 118 | :align: center 119 | 120 | Multiplexing multiple logical flows over a single 121 | physical link. 122 | 123 | Statistical multiplexing means that all the hosts in this network send 124 | packets whenever it suits them, and if it happens that several packets 125 | turn up at the same time at a switch, one of them will be transmitted 126 | first while the others are placed into a queue. So both the link and 127 | the queue are shared resources, and both are finite. The link can 128 | only carry so many bits per second, and the queue can only hold so 129 | many packets (or bytes) before it has to start discarding 130 | packets. Managing the access to these shared resources, and trying to 131 | do so in a way that prevents congestion collapse, is the essence 132 | of congestion control. A switch that occasionally puts packets in a 133 | queue is operating normally. A switch that has large numbers of 134 | packets in its queues all or most 135 | of the time is congested. We'll get to the definition of congestion 136 | collapse for networks later on, but it starts with congested switches, 137 | routers or links. 
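
To make the sharing story concrete, the following small simulation models a tail-drop FIFO queue in front of a link. This is our own illustrative sketch, not part of the book's code directory, and every constant in it is an invented assumption: arrivals are bursty and average slightly more packets per tick than the link can drain, so the queue climbs until it sits persistently full and the switch starts discarding packets.

.. code-block:: c

   /* Sketch of a congested switch port: a tail-drop FIFO queue in
    * front of a fixed-rate link.  All constants are illustrative
    * assumptions.  Offered load averages ~12 packets per tick against
    * a link that drains 10, so the queue eventually overflows. */
   #include <stdio.h>
   #include <stdlib.h>

   #define QUEUE_LIMIT 100   /* queue capacity, in packets */
   #define LINK_RATE    10   /* packets transmitted per tick */

   int main(void)
   {
       int queue = 0, drops = 0;

       for (int tick = 1; tick <= 100; tick++) {
           int arrivals = rand() % 25;    /* bursty arrivals, mean ~12 */

           queue += arrivals;
           if (queue > QUEUE_LIMIT) {     /* tail drop: queue is full */
               drops += queue - QUEUE_LIMIT;
               queue = QUEUE_LIMIT;
           }
           queue -= (queue < LINK_RATE) ? queue : LINK_RATE;
           printf("tick %3d: queue=%3d total drops=%d\n",
                  tick, queue, drops);
       }
       return 0;
   }

In the early ticks the queue occasionally drains, which is a switch operating normally; by the end of the run it hovers at its limit and drops accumulate, which is a switch that is congested.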
138 | 139 | For a deeper introduction to statistical multiplexing, and why it's 140 | the approach of choice for packet networks, we refer to the 141 | following text. 142 | 143 | .. _reading_statmux: 144 | .. admonition:: Further Reading 145 | 146 | `Requirements 147 | `__. 148 | *Computer Networks: A Systems Approach*, 2020. 149 | 150 | 151 | When a switch builds a queue of packets awaiting transmission, it 152 | needs to decide which packet gets sent next. Each switch in a 153 | packet-switched network makes this decision independently, on a 154 | packet-by-packet basis. One of the issues that arises is how to make 155 | this decision in a fair manner. For example, many switches are 156 | designed to service packets on a first-in, first-out (FIFO) 157 | basis. Another approach would be to transmit the packets from each of 158 | the different flows that are currently sending data through the switch 159 | in a round-robin manner. This might be done to ensure that certain 160 | flows receive a particular share of the link’s bandwidth or that they 161 | never have their packets delayed in the switch for more than a certain 162 | length of time. A network that attempts to allocate bandwidth to 163 | particular flows is sometimes said to support *Quality-of-Service 164 | (QoS)*. 165 | 166 | One thing to take away from this discussion is that it is in the 167 | nature of packet-switched networks that they will sometimes be 168 | congested. The focus of this book is on the large body of work that 169 | has been done to mitigate congestion, either by responding to it in 170 | effective ways to lessen it, or by preventing it before it occurs. 171 | 172 | 1.2 Controlling Congestion 173 | --------------------------- 174 | 175 | Resource allocation and congestion control are complex issues that have 176 | been the subject of much study ever since the first network was 177 | designed. They are still active areas of research. One factor that makes 178 | these issues complex is that they are not isolated to a single level 179 | of a protocol hierarchy. Resource allocation is partially implemented in 180 | the routers, switches, and links inside the network and partially in the 181 | transport protocol running on the end hosts. End systems may use 182 | signalling protocols to convey their resource requirements to network 183 | nodes, which respond with information about resource 184 | availability. Application protocols may themselves be designed to mitigate 185 | congestion, for example, by changing the resolution of video transmission 186 | based on the current network conditions. This is a canonical example 187 | of a *systems issue*: you can't fully understand congestion without 188 | looking at all the places in the system that it touches. 189 | 190 | We should clarify our terminology before going any further. By *resource 191 | allocation*, we mean the process by which network elements try to meet 192 | the competing demands that applications have for network 193 | resources—primarily link bandwidth and buffer space in routers or 194 | switches. Of course, it will often not be possible to meet all the 195 | demands, meaning that some users or applications may receive fewer 196 | network resources than they want. Part of the resource allocation 197 | problem is deciding when to say no and to whom. 198 | 199 | We use the term *congestion control* to describe the efforts made by 200 | network nodes (including end systems) to prevent or respond to overload conditions. 
Since 201 | congestion is generally bad for everyone, the first order of business is 202 | making congestion subside, or preventing it in the first place. This 203 | might be achieved simply by persuading a few hosts to stop sending, thus 204 | improving the situation for everyone else. However, it is more common 205 | for congestion-control mechanisms to have some aspect of fairness—that 206 | is, they try to share the pain among all users, rather than causing 207 | great pain to a few. Thus, we see that many congestion-control 208 | mechanisms have some sort of resource allocation built into them. 209 | 210 | It is also important to understand the difference between flow control 211 | and congestion control. Flow control involves keeping a fast sender from 212 | overrunning a slow receiver. Congestion control, by contrast, is 213 | intended to keep a set of senders from sending too much data *into the 214 | network* because of lack of resources at some point. These two concepts 215 | are often confused; as we will see, they also share some mechanisms. 216 | 217 | Given all the different places and layers where congestion control and resource 218 | allocation can be implemented, it is helpful to start with a simple 219 | approach, which is pretty much what Jacobson and Karels did (although 220 | their solution ended up having quite a few moving parts). 221 | 222 | In the early Internet, routers implemented the most basic resource 223 | allocation approach possible: FIFO queuing with tail drop. There was 224 | no awareness of flows or applications, so they simply accepted packets 225 | as they arrived, placed them in a queue whenever the outbound link 226 | capacity was less than the arrival rate, served the queue by the FIFO 227 | discipline, and dropped arriving packets if the queue was full 228 | ("tail-drop"). This is still the most common form of queuing 229 | today; we will discuss other approaches to queuing including 230 | *Active Queue Management* in a later chapter. 231 | 232 | The reason that congestion collapse occurred in the early Internet is that 233 | dropped packets are not just discarded and forgotten. When the 234 | end-to-end transport protocol is TCP, as it is for most Internet 235 | traffic, a dropped packet will be retransmitted. So as congestion 236 | rises, the number of retransmitted packets rises; in other words, the 237 | number of packets sent into the network increases even if there is no 238 | real increase in the offered load from users and applications. More 239 | packets lead to more drops leading to more retransmissions and so 240 | on. You can see how this leads to collapse. 241 | 242 | A useful term in this context is *goodput*, which is distinguished 243 | from throughput in the sense that only packets doing useful work are 244 | counted towards goodput. So, for example, if a link is running at 100% 245 | utilization, but 60% of the packets on that link are retransmitted due 246 | to earlier losses, you could say the goodput was only 40%. 247 | 248 | The key insight of early researchers on congestion control was that it 249 | was possible and necessary for TCP to do something other than blindly retransmit 250 | lost packets during times of congestion. TCP would have to detect the 251 | congestion—which it can do, for example, by noticing the loss of 252 | packets—and then respond to the congestion by *reducing* the amount of 253 | traffic sent into the network. 
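
The shape of that response can be caricatured in a few lines of C. What follows is our own sketch of the general idea (back off sharply when loss signals congestion, probe gently for more bandwidth otherwise); it is not the actual TCP implementation, and the window limits are invented for illustration.

.. code-block:: c

   /* Caricature of detect-and-respond congestion control.  The sender
    * maintains a congestion window (cwnd) bounding how much data it
    * may have in flight.  Illustrative sketch only; the real
    * algorithms are the subject of later chapters. */
   #define MSS        1460U     /* assumed segment size, in bytes */
   #define MAX_WINDOW 65535U    /* assumed upper bound on cwnd */

   static unsigned int cwnd = MSS;   /* start conservatively */

   void on_ack(void)    /* delivery confirmed: probe for more */
   {
       if (cwnd + MSS <= MAX_WINDOW)
           cwnd += MSS;
   }

   void on_loss(void)   /* loss interpreted as congestion */
   {
       cwnd /= 2;       /* reduce traffic sent into the network */
       if (cwnd < MSS)
           cwnd = MSS;
   }

The asymmetry between the gentle increase and the sharp decrease is deliberate; as later chapters explain, it turns out to be central to keeping the resulting control loop stable.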
This interaction between the end-to-end
254 | protocol and the network during times of congestion formed the basis
255 | for much of today's congestion control and avoidance approaches. We'll
256 | get into the specifics of how these approaches work in subsequent
257 | chapters.
258 |
259 |
260 | 1.3 Theoretical Underpinnings
261 | ------------------------------
262 |
263 | There has been a lot of important theoretical work done to understand
264 | congestion. At the core of congestion is queuing, and there is a huge
265 | body of theory behind queuing, much of which extends into other
266 | physical realms such as supermarket checkouts and road congestion. The
267 | standard reference on queuing for packet networks was written by one
268 | of the early pioneers of the ARPANET, Leonard Kleinrock.
269 |
270 | .. _reading_queue:
271 | .. admonition:: Further Reading
272 |
273 |    L. Kleinrock. `Queueing Systems, Volume 2
274 |    `__. Wiley-Interscience, 1976.
275 |
276 | As packet networks became more widespread in the 1980s, there was a
277 | great deal of interest in how traffic behaved, with a growing
278 | realization that it might be more complex than had first been
279 | thought. One of the most popular models for data traffic was the
280 | Poisson model, which had worked well for various systems like call
281 | arrivals in the telephone network and people arriving at a queue in a
282 | supermarket. But the more that people studied the Internet and other
283 | packet networks, the worse the Poisson model started to look. There
284 | are a number of seminal papers that make the case for more complex
285 | models, of which the following are two.
286 |
287 | .. _reading_pfail:
288 | .. admonition:: Further Reading
289 |
290 |    V. Paxson and S. Floyd. `Wide-Area Traffic: The Failure of Poisson Modeling
291 |    `__.
292 |    IEEE/ACM Transactions on Networking, June 1995.
293 |
294 |
295 |    W. Leland *et al.*, `On the self-similar nature of Ethernet
296 |    traffic
297 |    `__.
298 |    ACM SIGCOMM '93 Symposium, August 1993.
299 |
300 | These papers and others contributed to the consensus that Internet
301 | traffic is much more “bursty”—packets arrive in clumps—than had been
302 | assumed by early models. Furthermore, this burstiness displays
303 | *self-similarity*—a property of fractals, whereby, as you zoom in,
304 | you keep seeing similar complexity at finer resolutions. For Internet
305 | traffic, this means that at any time scale, from microseconds to
306 | hours, you will see similar sorts of patterns.
307 |
308 | This research had a number of practical consequences, such as the
309 | realization that packet queues might get to be very long indeed, and
310 | thus routers and switches should have reasonably large packet
311 | buffers. (Correctly sizing those buffers became its own research
312 | topic.) Link utilizations could not be reliably kept close to 100% all
313 | the time, because you had to allow room for unpredictable bursts.
314 |
315 | Two topics of particular importance when thinking about congestion
316 | avoidance are *fairness* and *stability*. When the network is
317 | congested, it's going to be necessary for some users or flows to send
318 | less. It is clearly worth asking: which flows should send less? Should
319 | all flows share the pain equally? And what happens if some flows pay
320 | more attention to congestion signals than others? These questions are
321 | at the heart of the fairness issue. Jain's *fairness index* is one of
322 | the widely accepted ways to measure how fair a network is.
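
For reference, the index itself is simple to state (we give the standard definition here). If :math:`n` flows receive throughputs :math:`x_1, x_2, \ldots, x_n`, then

.. math::

   J(x_1, \ldots, x_n) = \frac{\left( \sum_{i=1}^{n} x_i \right)^2}{n \sum_{i=1}^{n} x_i^2}

The index equals 1 when every flow receives an equal share, and it falls toward :math:`1/n` as a single flow captures all of the capacity.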
We dig into
323 | this topic in Chapter 3.
324 |
325 | Stability is a critical property for any sort of control system, which
326 | is what congestion control is. Congestion is detected, some action is
327 | taken to reduce the total amount of traffic, causing congestion to
328 | ease, at which point it would seem reasonable to start sending more
329 | traffic again, leading back to more congestion. You can imagine that
330 | this sort of oscillation between congested and uncongested states
331 | could go on forever, and would be quite detrimental if the network
332 | kept swinging between underutilization and collapse. We really want
333 | the system to find an equilibrium where the network is busy but not so much so that
334 | congestion collapse occurs. Finding these stable control loops has
335 | been one of the key challenges for congestion control system designers
336 | over the decades. The quest for stability features heavily in the
337 | early work of Jacobson and Karels, and stability remains a requirement
338 | that subsequent approaches have to meet.
339 |
340 | Once the initial congestion control algorithms of TCP were implemented
341 | and deployed, researchers began to build mathematical models of TCP's
342 | behavior. This enabled the relationship between packet loss rate,
343 | round-trip time, and throughput to be established. The foundation was
344 | laid in the paper by Mathis and colleagues, and that body of work
345 | has continued to grow as the congestion control algorithms
346 | evolve. The idea that TCP would converge to a certain throughput given
347 | stable conditions of RTT and loss also formed the basis for
348 | *TCP-friendly rate control (TFRC)*. TFRC extends TCP-like congestion
349 | control to applications that don't use TCP, based on the idea that
350 | they can still share available capacity in a fair way with those that
351 | do. We return to this topic in Chapter 7.
352 |
353 | .. _reading_mathis_eqn:
354 | .. admonition:: Further Reading
355 |
356 |    M. Mathis, J. Semke, J. Mahdavi, and T. Ott. `The Macroscopic
357 |    Behavior of the TCP Congestion Avoidance Algorithm
358 |    `__.
359 |    SIGCOMM CCR, 27(3), July 1997.
360 |
361 | Finally, much of the theoretical work on congestion control has framed
362 | the problem as *"a distributed algorithm to share network resources
363 | among competing sources, where the goal is to choose source rate so as
364 | to maximize aggregate source utility subject to capacity
365 | constraints."* Formulating a congestion-control mechanism as an algorithm
366 | to optimize an objective function is traceable to a paper by Frank
367 | Kelly in 1997, an approach later extended by Sanjeewa Athuraliya and Steven
368 | Low to take into account both traffic sources (TCP) and router queuing
369 | techniques (AQM).
370 |
371 | .. _reading_kelly_low:
372 | .. admonition:: Further Reading
373 |
374 |    F. Kelly. `Charging and Rate Control for Elastic Traffic
375 |    `__.
376 |    European Transactions on Telecommunications, 8:33–37, 1997.
377 |
378 |    S. Athuraliya and S. Low. `An Empirical Validation of a Duality
379 |    Model of TCP and Active Queue Management Algorithms
380 |    `__. Proceedings of the
381 |    Winter Simulation Conference, 2001.
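
In the notation of this literature, each source :math:`s` chooses a sending rate :math:`x_s` that it values according to a utility function :math:`U_s`, and each link :math:`l` of capacity :math:`c_l` is shared by the set :math:`S(l)` of sources whose routes cross it. The goal quoted above can then be written, in a standard statement of Kelly's formulation (sketched here for reference), as

.. math::

   \max_{x \geq 0} \sum_{s} U_s(x_s)
   \quad \text{subject to} \quad
   \sum_{s \in S(l)} x_s \leq c_l \quad \text{for every link } l

In this view, TCP variants and AQM schemes can be interpreted as distributed algorithms for solving such an optimization, which is the connection that Athuraliya and Low make precise.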
382 |
383 | This book does not pursue the mathematical formulation outlined in
384 | these papers (and the large body of work that followed), but we do
385 | find it helpful to recognize that there is an established connection
386 | between optimizing a utility function and the pragmatic aspects of the
387 | mechanisms described in this book. Congestion control is an area of
388 | networking in which theory and practice have been productively linked
389 | to explore the solution space and develop robust approaches to the
390 | problem.
391 |
392 | 1.4 Congestion Control Today
393 | ----------------------------
394 |
395 | It sometimes feels like networking protocols have all been nailed down
396 | and standardized for decades, but few areas have remained as dynamic
397 | as congestion control. While the early work by Jacobson, Karels, and
398 | others laid the foundation, there has been a long series of
399 | innovations that continue today. We cover many of these in detail in
400 | subsequent chapters, but you can rest assured that new ideas in
401 | congestion control will continue to emerge for years to come.
402 |
403 | Sometimes innovations are necessitated by changes in the
404 | landscape. For example, as bandwidths increased from megabits to
405 | gigabits per second, the amount of data in flight at any instant
406 | increased, which raised the stakes for detecting and responding to
407 | congestion quickly. High-latency links, such as trans-oceanic cables
408 | and satellite links, added to this problem by raising the round-trip
409 | time (RTT). These
410 | situations led to such innovations as using delay (and changes to
411 | delay) as a congestion signal (first seen in TCP Vegas). Also, with these "fatter pipes", there is a
412 | greater incentive to get the pipe filled quickly; you don't want to
413 | spend 10 RTTs figuring out how quickly you can send data
414 | if your message could have been sent in one or two RTTs. This led to
415 | efforts to more quickly determine the bottleneck bandwidth, such as
416 | XCP, RCP, and Quick-start for TCP.
417 |
418 | Wireless networks, which became mainstream long after the early days
419 | of TCP, added a new issue to the mix: packet loss was no longer a
420 | reliable congestion signal, but could instead be attributed to a noisy
421 | radio channel. This led to a range of approaches either to hide the
422 | loss from the TCP hosts or to improve the mechanisms by which TCP
423 | detects congestion.
424 |
425 | Cloud datacenters became another "use case" for congestion-control
426 | mechanisms. Unlike the Internet in general, where end-to-end latencies
427 | are highly variable, the RTT in a datacenter is both predictable and
428 | relatively small (<10ms). And because the network is highly regular in
429 | structure (e.g., a leaf-spine fabric), it is well understood where and
430 | under what circumstances congestion is likely to occur. This makes TCP
431 | running in a datacenter ripe for a purpose-tuned algorithm rather than
432 | having to use the general-purpose mechanism that runs on the global
433 | Internet.
434 |
435 | New applications have also contributed to the interest in improving
436 | congestion control. One salient example is the rise of video streaming
437 | as the (currently) dominant source of traffic on the Internet. Again,
438 | there were many approaches developed to make video work better under
439 | conditions of congestion.
One that has enjoyed great success is
440 | *Dynamic Adaptive Streaming over HTTP (DASH)*, in which the server
441 | delivering the video switches from one quality of encoding to another
442 | (and hence from one bit-rate to another) in response to the measured
443 | congestion on the path to the receiver. This moves the congestion
444 | control loop up to the application layer, or rather, it adds a second
445 | control loop on top of the one already provided by TCP.
446 |
447 | This quick tour of innovations is hardly exhaustive, and we will see
448 | more detail on these and other approaches in the coming chapters. The
449 | important thing to understand at this point is that congestion control
450 | continues to evolve as the technology landscape and application
451 | requirements change.
452 |
453 |
454 | 1.5 Reference Implementation
455 | -------------------------------
456 |
457 | We saw in Section 1.3 that there is a rich body of literature studying
458 | the mathematical properties of congestion-control algorithms, yet
459 | congestion control remains a highly pragmatic concern. It is estimated
460 | that TCP connections carry 85% of the traffic on the Internet, and
461 | those connections are anchored in software implementations of TCP
462 | running in every imaginable OS (e.g., Linux, Windows, MacOS, iOS,
463 | Android). As a practical matter, the very specification of the
464 | congestion-control mechanisms we discuss in this book is represented
465 | in kernel-level code, typically implemented in C. The theory defines
466 | abstract models of this code, but the code *specifies* the algorithm.
467 |
468 | If the implementation is effectively the specification, then which
469 | implementation is authoritative; which is the *reference
470 | implementation?* The answer has been the dominant open source
471 | implementation of the day. This was originally the *Berkeley Software
472 | Distribution (BSD)* implementation of Unix, and in fact, the initial
473 | algorithm proposed by Jacobson and Karels was a noteworthy feature of
474 | the Tahoe release of BSD 4.3 in 1988. This connection between BSD Unix
475 | and the TCP congestion-control algorithms was so strong that the
476 | variants of the algorithm became known (named) according to the BSD
477 | release: e.g., TCP Tahoe, and later TCP Reno.
478 |
479 | .. _reading_bsd:
480 | .. admonition:: Further Reading
481 |
482 |    S.J. Leffler, M.K. McKusick, M.J. Karels, and J.S. Quarterman. `The
483 |    Design and Implementation of the 4.3 BSD UNIX Operating System
484 |    `__.
485 |    Addison-Wesley, January 1989.
486 |
487 | .. sidebar:: Berkeley Unix
488 |
489 |    *Any student of the Internet should have an appreciation for
490 |    the role Berkeley Unix (aka BSD) played in the success of the
491 |    Internet. Unix, of course, originated at AT&T Bell Labs in the
492 |    early 1970s, but it was an investment by DARPA to support an
493 |    open source implementation of Unix—which was to include the
494 |    fledgling TCP/IP protocol stack—that proved to be
495 |    transformative.*
496 |
497 |    *At the time, the success of the Internet was not a foregone
498 |    conclusion. It was viewed by many as a research curiosity, and
499 |    certainly did not enjoy much support within the computing and
500 |    telecommunication incumbents of the day. It was largely because
501 |    universities (and their students) had access to an open
502 |    implementation of the Internet protocol stack, and affordable
503 |    hardware to run it on, that TCP/IP took root.
Seeding
504 |    transformative technology through open source software and
505 |    readily available hardware has proven to be a powerful
506 |    strategy, of which BSD is an early success story.*
507 |
508 | BSD and its descendants continue to this day (notably as FreeBSD), but BSD was eventually
509 | overtaken by Linux in the early 2000s as the *de facto* open source,
510 | Unix-based OS. All the variants of TCP congestion control described in
511 | this book are available (and can be optionally activated) in the Linux
512 | kernel. They have become the reference implementation of those
513 | algorithms, which leads us to our final point: The standard for
514 | evaluating TCP congestion-control mechanisms is empirical, by running
515 | real traffic between Linux-based implementations of TCP senders and
516 | receivers. The open question is: What traffic, and over what network?
517 |
518 | While useful insights can often be gained by observing the behavior of
519 | TCP connections running across the actual Internet, the wide
520 | variability (in both time and space) of "the Internet" makes
521 | controlled experiments virtually impossible. Instead, the current
522 | best practice is to run a collection of "representative flows" over
523 | isolated but "representative network topologies." There is no
524 | established gold standard for either the set of flows or the set of
525 | network topologies, so experimental results are never definitive. But
526 | the body of evidence collected using this methodology has proven
527 | sufficient to advance the state of the art over the years.
528 |
529 | For the purposes of this book, we use the experimental methodology
530 | described in Chapter 3. We use it both to visualize the behavior of
531 | the various algorithms (helping to build intuition) and to highlight
532 | problematic scenarios that continue to make congestion control such a
533 | challenging and interesting technical problem.
534 | -------------------------------------------------------------------------------- /latest.rst: --------------------------------------------------------------------------------
1 | .. role:: pop
2 |
3 | :pop:`Read The Latest!`
4 | ========================
5 |
6 | `Systems Approach Newsletter: `__ Stay
7 | up to date with the latest developments by subscribing to the
8 | `Systems Approach Newsletter
9 | `__, where the authors
10 | connect the concepts and lessons in this book to what's happening in
11 | the Internet today.
12 |
13 | `Book Series: `__ Also check out
14 | our companion books that cover emerging topics in more depth.
15 |
16 | * `Private 5G: A Systems Approach `__
17 |
18 | * `Software-Defined Networks: A Systems Approach `__
19 |
20 | * `Edge Cloud Operations: A Systems Approach `__
21 |
22 | .. * `TCP Congestion Control: A Systems Approach `__
23 | -------------------------------------------------------------------------------- /preface.rst: --------------------------------------------------------------------------------
1 | Preface
2 | =======
3 |
4 | Congestion control has been one of the most active areas of research
5 | in computer networking from the earliest days of packet switching. The
6 | work of Jacobson and Karels in the 1980s laid the foundation for
7 | decades of subsequent work by introducing a suite of congestion
8 | control mechanisms into TCP. This was done at a time of crisis, with
9 | the Internet showing signs of congestion collapse.
Ethernet inventor
10 | Bob Metcalfe famously predicted the Internet would collapse in the
11 | 1990s and followed up on his promise to eat his words when it did
12 | not. But it was clear even then that congestion control was not a
13 | fully solved problem, and improvements to the algorithms on which the
14 | Internet's smooth functioning depends have multiplied ever since.
15 |
16 | This book grew out of our own involvement in developing congestion
17 | control algorithms over the last three decades. There have been so
18 | many developments in congestion control over that time that it’s
19 | nearly impossible to include all of them. What we have tried to do in
20 | this book is to provide a framework for understanding congestion control
21 | as a systems problem, and to characterize the many approaches along a
22 | few main themes. For example, our work on TCP Vegas opened up a line
23 | of research that continues today, where the aim is to avoid severe
24 | congestion rather than react after it has set in. We thus consider
25 | avoidance-based approaches as one of the main categories of congestion
26 | control.
27 |
28 | We expect this to be an evolving manuscript. There are many efforts in
29 | congestion control that are not currently covered, the algorithms that
30 | are covered continue to be refined, and new approaches will likely
31 | emerge to address new use cases. We will update the book as necessary
32 | to reflect the state of the field. Please help by submitting your
33 | comments and feedback. We also welcome contributions to the on-line
34 | annotated bibliography.
35 |
36 | Finally, we extend our thanks to those who have contributed to the
37 | open source effort to improve this book. They include:
38 |
39 | - Bill Fisher
40 | - Giulio Micheloni
41 | - J van Bemmel
42 | - Omer Shapira
43 | - Nico Vibert
44 | - Vik Vanderlinden
45 | - Vidhi Goel
46 |
47 | Please send
48 | us your comments and feedback using the `Issues Link
49 | `__. See the `Wiki
50 | `__ for the latest todo
51 | list.
52 |
53 | | Larry Peterson, Lawrence Brakmo, and Bruce Davie
54 | | May 2022
55 |
56 | -------------------------------------------------------------------------------- /print.rst: --------------------------------------------------------------------------------
1 | .. role:: pop
2 |
3 | :pop:`Print Copies`
4 | ===========================
5 |
6 | We make all books in the *Systems Approach* series available as both
7 | print and e-books. This book is available via Amazon: `TCP Congestion Control: A Systems Approach `__
8 |
9 | `Book Series: `__ Also check out
10 | our companion books that cover networking and emerging topics in more depth.
11 |
12 | * `Computer Networks: A Systems Approach `__
13 |
14 | * `Software-Defined Networks: A Systems Approach
15 |   `__
16 |
17 | * `Private 5G: A Systems Approach `__
18 |
19 | * `Edge Cloud Operations: A Systems Approach `__
20 |
21 | .. * `TCP Congestion Control: A Systems Approach `__
22 |
23 | As participants in the Amazon Associate program, we may earn income from qualifying purchases using the links above.
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | Sphinx~=5.3.0
2 | doc8~=0.10.1
3 | docutils~=0.17.1
4 | reuse~=0.14.0
5 | sphinx-rtd-theme~=1.0.0
6 | sphinxcontrib-spelling~=7.3.2
7 | sphinx-multiversion~=0.2.4
8 | pytz~=2023.3
--------------------------------------------------------------------------------