├── .github ├── CODEOWNERS └── workflows │ ├── publish-docs.yaml │ └── validate-docs.yaml ├── .gitignore ├── .gitpod.yml ├── LICENSE ├── Makefile ├── README.rst ├── VERSION ├── _extra └── robots.txt ├── _static ├── SystemsApproachLogoURL.png ├── bridge.ico ├── cover.jpg ├── css │ └── rtd_theme_mods.css └── fonts │ ├── Inconsolata-Bold.ttf │ └── Inconsolata-Regular.ttf ├── algorithm.rst ├── aqm.rst ├── authors.rst ├── avoidance.rst ├── biblio.rst ├── code ├── README ├── build.sh ├── cwin.c ├── nagle.c ├── red.c └── timeout.c ├── conf.py ├── design.rst ├── dict.txt ├── figures ├── Figure-sources.pptx ├── Graph_16B.png ├── Graph_1A.png ├── Graph_1B.png ├── Graph_6C.png ├── Graph_8B.png ├── Graph_8C.png ├── Slide1.png ├── Slide10.png ├── Slide11.png ├── Slide12.png ├── Slide13.png ├── Slide14.png ├── Slide15.png ├── Slide16.png ├── Slide2.png ├── Slide3.png ├── Slide4.png ├── Slide5.png ├── Slide6.png ├── Slide7.png ├── Slide8.png ├── Slide9.png ├── f03-16-9780123850591.png ├── f05-03-9780123850591.png ├── f05-04-9780123850591.png ├── f05-05-9780123850591.png ├── f05-08-9780123850591.png ├── f05-10-9780123850591.png ├── f06-03-9780123850591.png ├── f06-05-9780123850591.png ├── f06-08-9780123850591.png ├── f06-09-9780123850591.png ├── f06-10-9780123850591.png ├── f06-11-9780123850591.png ├── f06-12-9780123850591.png ├── f06-13-9780123850591.png ├── f06-14-9780123850591.png ├── f06-15-9780123850591.png ├── f06-16-9780123850591.png ├── f06-17-9780123850591.png ├── f06-18-9780123850591.png └── f06-19-9780123850591.png ├── foreword.rst ├── index.rst ├── intro.rst ├── latest.rst ├── preface.rst ├── print.rst ├── requirements.txt ├── tcp_ip.rst └── variants.rst /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #require review 2 | * @llpeterson @drbruced12 3 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docs Workflow 2 | run-name: ${{ github.actor }} is publishing document artifacts 🚀 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 15 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | 20 | jobs: 21 | # Single deploy job since we're just deploying 22 | deploy: 23 | environment: 24 | name: github-pages 25 | url: ${{ steps.deployment.outputs.page_url }} 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v4 30 | - name: Setup Pages 31 | uses: actions/configure-pages@v4 32 | - name: Build html 33 | run: make html 34 | - name: Upload artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | # Upload build repository 38 | path: './_build/html' 39 | - name: Deploy to GitHub Pages 40 | id: deployment 41 | uses: actions/deploy-pages@v4 42 | 43 | 44 | - run: echo "🍏 This job's status is ${{ job.status }}." 
45 | -------------------------------------------------------------------------------- /.github/workflows/validate-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Validate Docs Workflow 2 | run-name: ${{ github.actor }} is validating document source 3 | on: [pull_request, workflow_dispatch] 4 | jobs: 5 | Validate_Docs: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 9 | - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" 10 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 11 | - name: Check out repo 12 | uses: actions/checkout@v4 13 | - name: Validate source 14 | run: make test 15 | - name: Build html 16 | run: make html 17 | - name: List built files 18 | run: | 19 | ls ${{ github.workspace }}/_build/html 20 | 21 | - run: echo "🍏 This job's status is ${{ job.status }}." 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.tex 3 | venv-docs 4 | .DS_Store 5 | */.DS_Store 6 | figures-pdf/ 7 | figures-hi_res/ 8 | figures-low_res/ 9 | private/ 10 | local/ 11 | scripts/ 12 | _build/ 13 | -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | # This configuration file was automatically generated by Gitpod. 2 | # Please adjust to your needs (see https://www.gitpod.io/docs/config-gitpod-file) 3 | # and commit this file to your remote git repository to share the goodness with others. 4 | 5 | tasks: 6 | - init: sudo apt-get -y install libenchant1c2a 7 | command: make html 8 | - name: Start web server 9 | init: python -m http.server 8000 10 | 11 | ports: 12 | - port: 8000 13 | onOpen: open-preview 14 | 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and 4 | agree to be bound by the terms and conditions of this Creative Commons 5 | Attribution-NonCommercial-NoDerivatives 4.0 International Public 6 | License ("Public License"). To the extent this Public License may be 7 | interpreted as a contract, You are granted the Licensed Rights in 8 | consideration of Your acceptance of these terms and conditions, and 9 | the Licensor grants You such rights in consideration of benefits the 10 | Licensor receives from making the Licensed Material available under 11 | these terms and conditions. 12 | 13 | Section 1 – Definitions. 14 | 15 | (a) Adapted Material means material subject to Copyright and Similar 16 | Rights that is derived from or based upon the Licensed Material and 17 | in which the Licensed Material is translated, altered, arranged, 18 | transformed, or otherwise modified in a manner requiring permission 19 | under the Copyright and Similar Rights held by the Licensor. For 20 | purposes of this Public License, where the Licensed Material is a 21 | musical work, performance, or sound recording, Adapted Material is 22 | always produced where the Licensed Material is synched in timed 23 | relation with a moving image.
24 | 25 | (b) Copyright and Similar Rights means copyright and/or similar 26 | rights closely related to copyright including, without limitation, 27 | performance, broadcast, sound recording, and Sui Generis Database 28 | Rights, without regard to how the rights are labeled or 29 | categorized. For purposes of this Public License, the rights 30 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 31 | Rights. 32 | 33 | (c) Effective Technological Measures means those measures that, in 34 | the absence of proper authority, may not be circumvented under laws 35 | fulfilling obligations under Article 11 of the WIPO Copyright Treaty 36 | adopted on December 20, 1996, and/or similar international 37 | agreements. 38 | 39 | (d) Exceptions and Limitations means fair use, fair dealing, and/or 40 | any other exception or limitation to Copyright and Similar Rights 41 | that applies to Your use of the Licensed Material. 42 | 43 | (e) Licensed Material means the artistic or literary work, database, 44 | or other material to which the Licensor applied this Public License. 45 | 46 | (f) Licensed Rights means the rights granted to You subject to the 47 | terms and conditions of this Public License, which are limited to 48 | all Copyright and Similar Rights that apply to Your use of the 49 | Licensed Material and that the Licensor has authority to license. 50 | 51 | (g) Licensor means the individual(s) or entity(ies) granting rights 52 | under this Public License. 53 | 54 | (h) NonCommercial means not primarily intended for or directed 55 | towards commercial advantage or monetary compensation. For purposes 56 | of this Public License, the exchange of the Licensed Material for 57 | other material subject to Copyright and Similar Rights by digital 58 | file-sharing or similar means is NonCommercial provided there is no 59 | payment of monetary compensation in connection with the exchange. 60 | 61 | (i) Share means to provide material to the public by any means or 62 | process that requires permission under the Licensed Rights, such as 63 | reproduction, public display, public performance, distribution, 64 | dissemination, communication, or importation, and to make material 65 | available to the public including in ways that members of the public 66 | may access the material from a place and at a time individually 67 | chosen by them. 68 | 69 | (j) Sui Generis Database Rights means rights other than copyright 70 | resulting from Directive 96/9/EC of the European Parliament and of 71 | the Council of 11 March 1996 on the legal protection of databases, 72 | as amended and/or succeeded, as well as other essentially equivalent 73 | rights anywhere in the world. 74 | 75 | (k) You means the individual or entity exercising the Licensed 76 | Rights under this Public License. Your has a corresponding meaning. 77 | 78 | Section 2 – Scope. 79 | 80 | (a) License grant. 81 | 82 | (1) Subject to the terms and conditions of this Public License, 83 | the Licensor hereby grants You a worldwide, royalty-free, 84 | non-sublicensable, non-exclusive, irrevocable license to exercise 85 | the Licensed Rights in the Licensed Material to: 86 | 87 | (A) reproduce and Share the Licensed Material, in whole or in 88 | part, for NonCommercial purposes only; and 89 | 90 | (B) produce and reproduce, but not Share, Adapted Material for 91 | NonCommercial purposes only. 92 | 93 | (2) Exceptions and Limitations. 
For the avoidance of doubt, where 94 | Exceptions and Limitations apply to Your use, this Public License 95 | does not apply, and You do not need to comply with its terms and 96 | conditions. 97 | 98 | (3) Term. The term of this Public License is specified in Section 99 | 6(a). 100 | 101 | (4) Media and formats; technical modifications allowed. The 102 | Licensor authorizes You to exercise the Licensed Rights in all 103 | media and formats whether now known or hereafter created, and to 104 | make technical modifications necessary to do so. The Licensor 105 | waives and/or agrees not to assert any right or authority to 106 | forbid You from making technical modifications necessary to 107 | exercise the Licensed Rights, including technical modifications 108 | necessary to circumvent Effective Technological Measures. For 109 | purposes of this Public License, simply making modifications 110 | authorized by this Section 2(a)(4) never produces Adapted 111 | Material. 112 | 113 | (5) Downstream recipients. 114 | 115 | (A) Offer from the Licensor – Licensed Material. Every recipient 116 | of the Licensed Material automatically receives an offer from 117 | the Licensor to exercise the Licensed Rights under the terms and 118 | conditions of this Public License. 119 | 120 | (B) No downstream restrictions. You may not offer or impose any 121 | additional or different terms or conditions on, or apply any 122 | Effective Technological Measures to, the Licensed Material if 123 | doing so restricts exercise of the Licensed Rights by any 124 | recipient of the Licensed Material. 125 | 126 | (6) No endorsement. Nothing in this Public License constitutes or 127 | may be construed as permission to assert or imply that You are, or 128 | that Your use of the Licensed Material is, connected with, or 129 | sponsored, endorsed, or granted official status by, the Licensor 130 | or others designated to receive attribution as provided in Section 131 | 3(a)(1)(A)(i). 132 | 133 | (b) Other rights. 134 | 135 | (1) Moral rights, such as the right of integrity, are not licensed 136 | under this Public License, nor are publicity, privacy, and/or 137 | other similar personality rights; however, to the extent possible, 138 | the Licensor waives and/or agrees not to assert any such rights 139 | held by the Licensor to the limited extent necessary to allow You 140 | to exercise the Licensed Rights, but not otherwise. 141 | 142 | (2) Patent and trademark rights are not licensed under this Public 143 | License. 144 | 145 | (3) To the extent possible, the Licensor waives any right to 146 | collect royalties from You for the exercise of the Licensed 147 | Rights, whether directly or through a collecting society under any 148 | voluntary or waivable statutory or compulsory licensing scheme. In 149 | all other cases the Licensor expressly reserves any right to 150 | collect such royalties, including when the Licensed Material is 151 | used other than for NonCommercial purposes. 152 | 153 | Section 3 – License Conditions. 154 | 155 | Your exercise of the Licensed Rights is expressly made subject to the 156 | following conditions. 157 | 158 | (a) Attribution. 
159 | 160 | (1) If You Share the Licensed Material, You must: 161 | 162 | (A) retain the following if it is supplied by the Licensor with 163 | the Licensed Material: 164 | 165 | (i) identification of the creator(s) of the Licensed Material 166 | and any others designated to receive attribution, in any 167 | reasonable manner requested by the Licensor (including by 168 | pseudonym if designated); 169 | 170 | (ii) a copyright notice; 171 | 172 | (iii) a notice that refers to this Public License; 173 | 174 | (iv) a notice that refers to the disclaimer of warranties; 175 | 176 | (v) a URI or hyperlink to the Licensed Material to the extent 177 | reasonably practicable; 178 | 179 | (B) indicate if You modified the Licensed Material and retain an 180 | indication of any previous modifications; and 181 | 182 | (C) indicate the Licensed Material is licensed under this Public 183 | License, and include the text of, or the URI or hyperlink to, 184 | this Public License. 185 | 186 | For the avoidance of doubt, You do not have permission under this 187 | Public License to Share Adapted Material. 188 | 189 | (2) You may satisfy the conditions in Section 3(a)(1) in any 190 | reasonable manner based on the medium, means, and context in which 191 | You Share the Licensed Material. For example, it may be reasonable 192 | to satisfy the conditions by providing a URI or hyperlink to a 193 | resource that includes the required information. 194 | 195 | (3) If requested by the Licensor, You must remove any of the 196 | information required by Section 3(a)(1)(A) to the extent 197 | reasonably practicable. 198 | 199 | Section 4 – Sui Generis Database Rights. 200 | 201 | Where the Licensed Rights include Sui Generis Database Rights that 202 | apply to Your use of the Licensed Material: 203 | 204 | (a) for the avoidance of doubt, Section 2(a)(1) grants You the right 205 | to extract, reuse, reproduce, and Share all or a substantial portion 206 | of the contents of the database for NonCommercial purposes only and 207 | provided You do not Share Adapted Material; 208 | 209 | (b) if You include all or a substantial portion of the database 210 | contents in a database in which You have Sui Generis Database 211 | Rights, then the database in which You have Sui Generis Database 212 | Rights (but not its individual contents) is Adapted Material; and 213 | 214 | (c) You must comply with the conditions in Section 3(a) if You Share 215 | all or a substantial portion of the contents of the database. 216 | 217 | For the avoidance of doubt, this Section 4 supplements and does not 218 | replace Your obligations under this Public License where the Licensed 219 | Rights include other Copyright and Similar Rights. 220 | 221 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 222 | 223 | (a) Unless otherwise separately undertaken by the Licensor, to the 224 | extent possible, the Licensor offers the Licensed Material as-is and 225 | as-available, and makes no representations or warranties of any kind 226 | concerning the Licensed Material, whether express, implied, 227 | statutory, or other. This includes, without limitation, warranties 228 | of title, merchantability, fitness for a particular purpose, 229 | non-infringement, absence of latent or other defects, accuracy, or 230 | the presence or absence of errors, whether or not known or 231 | discoverable. Where disclaimers of warranties are not allowed in 232 | full or in part, this disclaimer may not apply to You. 
233 | 234 | (b) To the extent possible, in no event will the Licensor be liable 235 | to You on any legal theory (including, without limitation, 236 | negligence) or otherwise for any direct, special, indirect, 237 | incidental, consequential, punitive, exemplary, or other losses, 238 | costs, expenses, or damages arising out of this Public License or 239 | use of the Licensed Material, even if the Licensor has been advised 240 | of the possibility of such losses, costs, expenses, or 241 | damages. Where a limitation of liability is not allowed in full or 242 | in part, this limitation may not apply to You. 243 | 244 | (c) The disclaimer of warranties and limitation of liability 245 | provided above shall be interpreted in a manner that, to the extent 246 | possible, most closely approximates an absolute disclaimer and 247 | waiver of all liability. 248 | 249 | Section 6 – Term and Termination. 250 | 251 | (a) This Public License applies for the term of the Copyright and 252 | Similar Rights licensed here. However, if You fail to comply with 253 | this Public License, then Your rights under this Public License 254 | terminate automatically. 255 | 256 | (b) Where Your right to use the Licensed Material has terminated 257 | under Section 6(a), it reinstates: 258 | 259 | (1) automatically as of the date the violation is cured, provided 260 | it is cured within 30 days of Your discovery of the violation; or 261 | 262 | (2) upon express reinstatement by the Licensor. 263 | 264 | For the avoidance of doubt, this Section 6(b) does not affect 265 | any right the Licensor may have to seek remedies for Your violations 266 | of this Public License. 267 | 268 | (c) For the avoidance of doubt, the Licensor may also offer the 269 | Licensed Material under separate terms or conditions or stop 270 | distributing the Licensed Material at any time; however, doing so 271 | will not terminate this Public License. 272 | 273 | (d) Sections 1, 5, 6, 7, and 8 survive termination of this Public 274 | License. 275 | 276 | Section 7 – Other Terms and Conditions. 277 | 278 | (a) The Licensor shall not be bound by any additional or different 279 | terms or conditions communicated by You unless expressly agreed. 280 | 281 | (b) Any arrangements, understandings, or agreements regarding the 282 | Licensed Material not stated herein are separate from and 283 | independent of the terms and conditions of this Public License. 284 | 285 | Section 8 – Interpretation. 286 | 287 | (a) For the avoidance of doubt, this Public License does not, and 288 | shall not be interpreted to, reduce, limit, restrict, or impose 289 | conditions on any use of the Licensed Material that could lawfully 290 | be made without permission under this Public License. 291 | 292 | (b) To the extent possible, if any provision of this Public License 293 | is deemed unenforceable, it shall be automatically reformed to the 294 | minimum extent necessary to make it enforceable. If the provision 295 | cannot be reformed, it shall be severed from this Public License 296 | without affecting the enforceability of the remaining terms and 297 | conditions. 298 | 299 | (c) No term or condition of this Public License will be waived and 300 | no failure to comply consented to unless expressly agreed to by the 301 | Licensor. 
302 | 303 | (d) Nothing in this Public License constitutes or may be interpreted 304 | as a limitation upon, or waiver of, any privileges and immunities 305 | that apply to the Licensor or You, including from the legal 306 | processes of any jurisdiction or authority. 307 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for ONF documentation created with Sphinx 2 | 3 | # use bash for pushd/popd, and to fail quickly. virtualenv's activate 4 | # has undefined variables, so no -u 5 | SHELL = bash -e -o pipefail 6 | 7 | # You can set these variables from the command line. 8 | SPHINXOPTS ?= 9 | SPHINXBUILD ?= sphinx-build 10 | SOURCEDIR ?= . 11 | BUILDDIR ?= _build 12 | 13 | # Create the virtualenv with all the tools installed 14 | VIRTUALENV = venv-docs 15 | 16 | # Put it first so that "make" without argument is like "make help". 17 | help: $(VIRTUALENV) 18 | source ./$(VIRTUALENV)/bin/activate ;\ 19 | $(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | 21 | .PHONY: help lint reload Makefile test 22 | 23 | # Create the virtualenv with all the tools installed 24 | $(VIRTUALENV): 25 | python3 -m venv $@ ;\ 26 | source ./$@/bin/activate ;\ 27 | pip install -r requirements.txt 28 | 29 | # lint and link verification. linkcheck is built into sphinx 30 | test: lint spelling 31 | 32 | # lint all .rst files 33 | lint: $(VIRTUALENV) 34 | source ./$(VIRTUALENV)/bin/activate ;\ 35 | doc8 --max-line-length 119 *.rst 36 | 37 | # NOTE: the rules below are a reconstruction (the original file is truncated here); 38 | # they follow the standard Sphinx Makefile pattern used by the help target above. 39 | spelling: $(VIRTUALENV) 40 | source ./$(VIRTUALENV)/bin/activate ;\ 41 | $(SPHINXBUILD) -b spelling "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 42 | 43 | # catch-all: route any other target (e.g., html, epub) to Sphinx 44 | %: Makefile $(VIRTUALENV) 45 | source ./$(VIRTUALENV)/bin/activate ;\ 46 | $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 47 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | About The Book 2 | ============== 3 | 4 | This repository contains source text for the book *TCP Congestion 5 | Control: A Systems Approach*, written by Larry Peterson, Lawrence 6 | Brakmo, and Bruce Davie. An online version is published at 7 | `https://tcpcc.systemsapproach.org 8 | <https://tcpcc.systemsapproach.org>`__, and instructions for 9 | building a web-viewable copy from this source are given below. 10 | 11 | License 12 | ------- 13 | 14 | The book is available under terms of the `CC BY-NC-ND 4.0 15 | <https://creativecommons.org/licenses/by-nc-nd/4.0>`__ license. The 16 | community is invited to contribute corrections, improvements, updates, 17 | and new material under the same terms. While this license does not 18 | automatically grant the right to make derivative works, we are keen to 19 | discuss derivative works (such as translations) with interested 20 | parties. Please reach out to 21 | `discuss@systemsapproach.org <mailto:discuss@systemsapproach.org>`__. 22 | 23 | If you make use of this work, the attribution should include the 24 | following information: 25 | 26 | | *Title: TCP Congestion Control: A Systems Approach* 27 | | *Authors: Larry Peterson, Lawrence Brakmo, and Bruce Davie* 28 | | *Source:* https://github.com/SystemsApproach/tcpcc 29 | | *License:* \ `CC BY-NC-ND 4.0 <https://creativecommons.org/licenses/by-nc-nd/4.0>`__ 30 | 31 | Read the Book 32 | ------------- 33 | 34 | This book is part of the `Systems Approach Series 35 | <https://www.systemsapproach.org>`__, with an online version 36 | published at `https://tcpcc.systemsapproach.org 37 | <https://tcpcc.systemsapproach.org>`__. 38 | 39 | To track progress and receive notices about new versions, you can follow 40 | the project on 41 | Mastodon. 42 | To read a running commentary on how the Internet is evolving, and for 43 | updates on our writing projects, you can sign up for the 44 | Systems Approach Newsletter. 45 | 46 | Build the Book 47 | -------------- 48 | 49 | To build a web-viewable version, you first need to download the source: 50 | 51 | .. code:: shell 52 | 53 | $ mkdir ~/systemsapproach 54 | $ cd ~/systemsapproach 55 | $ git clone https://github.com/SystemsApproach/tcpcc.git 56 | $ cd tcpcc 57 | 58 | The build process is stored in the Makefile and requires Python be 59 | installed. The Makefile will create a virtualenv (``venv-docs``) which 60 | installs the documentation generation toolset. You may also need to 61 | install the ``enchant`` C library using your system’s package manager 62 | for the spelling checker to function properly. 63 | 64 | To generate HTML in ``_build/html``, run ``make html``. 65 | 66 | To check the formatting of the book, run ``make lint``.
67 | 68 | To check spelling, run ``make spelling``. If there are additional 69 | words, names, or acronyms that are correctly spelled but not in the 70 | dictionary, please add them to the ``dict.txt`` file. 71 | 72 | To see the other available output formats, run ``make``. 73 | 74 | Contribute to the Book 75 | ---------------------- 76 | 77 | We hope that if you use this material, you are also willing to 78 | contribute back to it. If you are new to open source, you might check 79 | out this `How to Contribute to Open 80 | Source <https://opensource.guide/how-to-contribute/>`__ guide. Among 81 | other things, you’ll learn about posting *Issues* that you’d like to see 82 | addressed, and issuing *Pull Requests* to merge your improvements back 83 | into GitHub. 84 | 85 | If you’d like to contribute and are looking for something that needs 86 | attention, see the `wiki <https://github.com/SystemsApproach/tcpcc/wiki>`__ 87 | for the current TODO list. 88 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | Version 1.1-dev -------------------------------------------------------------------------------- /_extra/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: AI2Bot 2 | User-agent: Ai2Bot-Dolma 3 | User-agent: aiHitBot 4 | User-agent: Amazonbot 5 | User-agent: anthropic-ai 6 | User-agent: Applebot 7 | User-agent: Applebot-Extended 8 | User-agent: Brightbot 1.0 9 | User-agent: Bytespider 10 | User-agent: CCBot 11 | User-agent: ChatGPT-User 12 | User-agent: Claude-Web 13 | User-agent: ClaudeBot 14 | User-agent: cohere-ai 15 | User-agent: cohere-training-data-crawler 16 | User-agent: Cotoyogi 17 | User-agent: Crawlspace 18 | User-agent: Diffbot 19 | User-agent: DuckAssistBot 20 | User-agent: FacebookBot 21 | User-agent: Factset_spyderbot 22 | User-agent: FirecrawlAgent 23 | User-agent: FriendlyCrawler 24 | User-agent: Google-Extended 25 | User-agent: GoogleOther 26 | User-agent: GoogleOther-Image 27 | User-agent: GoogleOther-Video 28 | User-agent: GPTBot 29 | User-agent: iaskspider/2.0 30 | User-agent: ICC-Crawler 31 | User-agent: ImagesiftBot 32 | User-agent: img2dataset 33 | User-agent: imgproxy 34 | User-agent: ISSCyberRiskCrawler 35 | User-agent: Kangaroo Bot 36 | User-agent: meta-externalagent 37 | User-agent: Meta-ExternalAgent 38 | User-agent: meta-externalfetcher 39 | User-agent: Meta-ExternalFetcher 40 | User-agent: NovaAct 41 | User-agent: OAI-SearchBot 42 | User-agent: omgili 43 | User-agent: omgilibot 44 | User-agent: Operator 45 | User-agent: PanguBot 46 | User-agent: Perplexity-User 47 | User-agent: PerplexityBot 48 | User-agent: PetalBot 49 | User-agent: Scrapy 50 | User-agent: SemrushBot-OCOB 51 | User-agent: SemrushBot-SWA 52 | User-agent: Sidetrade indexer bot 53 | User-agent: TikTokSpider 54 | User-agent: Timpibot 55 | User-agent: VelenPublicWebCrawler 56 | User-agent: Webzio-Extended 57 | User-agent: YouBot 58 | Disallow: / 59 | -------------------------------------------------------------------------------- /_static/SystemsApproachLogoURL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/SystemsApproachLogoURL.png -------------------------------------------------------------------------------- /_static/bridge.ico: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/bridge.ico -------------------------------------------------------------------------------- /_static/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/cover.jpg -------------------------------------------------------------------------------- /_static/css/rtd_theme_mods.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019-present Open Networking Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. */ 15 | 16 | /* Don't restrict content width on the RTD theme 17 | * from: https://stackoverflow.com/a/32898444 */ 18 | 19 | .wy-nav-content { 20 | max-width: none; 21 | } 22 | 23 | .wy-table-responsive table td, .wy-table-responsive table th { 24 | white-space: normal; 25 | } 26 | 27 | /* Colors for navigation */ 28 | 29 | .wy-side-nav-search, .wy-nav-top { 30 | background: #2F5597; 31 | } 32 | 33 | /* .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#2F5597} */ 34 | 35 | .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#6AB0DE} 36 | 37 | /* Headings */ 38 | h1, h2 { 39 | font-weight: bold; 40 | line-height: 1.25; 41 | color: #3279a8; 42 | text-rendering: optimizeLegibility; 43 | } 44 | 45 | h3, h4, h5, h6 { 46 | margin-bottom: .5rem; 47 | font-style: italic; 48 | line-height: 1.25; 49 | color: #313131; 50 | text-rendering: optimizeLegibility; 51 | } 52 | 53 | h1 { 54 | margin-bottom: 2rem; 55 | font-size: 2rem; 56 | } 57 | 58 | h2 { 59 | margin-bottom: .5rem; 60 | margin-top: 1rem; 61 | font-size: 1.5rem; 62 | } 63 | 64 | h3 { 65 | margin-top: 1.5rem; 66 | font-size: 1.25rem; 67 | } 68 | 69 | .pop { 70 | color: #6AB0DE; 71 | font-style: italic; 72 | font-weight: bold; 73 | } 74 | aside.sidebar { 75 | margin: 0 0 0.5em 1em; 76 | border: 1px solid #ddb; 77 | padding: 7px 7px 0 7px; 78 | background-color: #ffe; 79 | width: 40%; 80 | float: right; 81 | } 82 | -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /aqm.rst: -------------------------------------------------------------------------------- 1 | Chapter 6: Active Queue Management 2 |
=================================== 3 | 4 | We now look at the role routers can play in congestion control, an 5 | approach often referred to as *Active Queue Management* (AQM). By 6 | its very nature, AQM introduces an element of avoidance to the 7 | end-to-end solution, even when paired with a control-based approach 8 | like TCP Reno. 9 | 10 | Changing router behavior has never been the Internet’s preferred way 11 | of introducing new features, but nonetheless, the approach has been a 12 | constant source of consternation over the last 30 years. The problem 13 | is that while it’s generally agreed that routers are in an ideal 14 | position to detect the onset of congestion—it's their queues that 15 | start to fill up—there has not been a consensus on exactly what the 16 | best algorithm is. The following describes two of the classic 17 | mechanisms, and concludes with a brief discussion of where things 18 | stand today. 19 | 20 | 6.1 DECbit 21 | ---------- 22 | 23 | The first mechanism was developed for use on the Digital Network 24 | Architecture (DNA), an early peer of the TCP/IP Internet that also 25 | adopted a connectionless/best-effort network model. A description 26 | of the approach, published by K.K. Ramakrishnan and Raj Jain, was 27 | presented at the same SIGCOMM as the Jacobson/Karels paper in 1988. 28 | 29 | .. _reading_decbit: 30 | .. admonition:: Further Reading 31 | 32 | K.K. Ramakrishnan and R. Jain. 33 | A Binary Feedback Scheme for 34 | Congestion Avoidance in Computer Networks with a Connectionless 35 | Network Layer. 36 | ACM SIGCOMM, August 1988. 37 | 38 | The idea is to more evenly split the responsibility for congestion 39 | control between the routers and the end hosts. Each router monitors 40 | the load it is experiencing and explicitly notifies the end nodes when 41 | congestion is about to occur. This notification is implemented by 42 | setting a binary congestion bit in the packets that flow through the 43 | router, which came to be known as the *DECbit*. The destination host 44 | then copies this congestion bit into the ACK it sends back to the 45 | source. Finally, the source adjusts its sending rate so as to avoid 46 | congestion. The following discussion describes the algorithm in more 47 | detail, starting with what happens in the router. 48 | 49 | A single congestion bit is added to the packet header. A router sets 50 | this bit in a packet if its average queue length is greater than or 51 | equal to 1 at the time the packet arrives. This average queue length 52 | is measured over a time interval that spans the last busy+idle cycle, 53 | plus the current busy cycle. (The router is *busy* when it is 54 | transmitting and *idle* when it is not.) :numref:`Figure %s 55 | <fig-decbit>` shows the queue length at a router as a function of 56 | time. Essentially, the router calculates the area under the curve and 57 | divides this value by the time interval to compute the average queue 58 | length. Using a queue length of 1 as the trigger for setting the 59 | congestion bit is a trade-off between significant queuing (and hence 60 | higher throughput) and increased idle time (and hence lower delay). In 61 | other words, a queue length of 1 seems to optimize the power function. 62 | 63 | .. _fig-decbit: 64 | .. figure:: figures/f06-14-9780123850591.png 65 | :width: 500px 66 | :align: center 67 | 68 | Computing average queue length at a router.
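To make this concrete, here is a minimal sketch of how a router might
maintain the average. It is illustrative rather than a description of
any production implementation; the function names, the event-driven
structure, and the cycle bookkeeping are all assumptions made for the
example.

.. code:: c

   /* Sketch of the DECbit average queue length computation. The
    * router integrates queue length over time (the "area under the
    * curve"), spanning the previous busy+idle cycle plus the current
    * cycle, then divides by the elapsed time. */

   static double prev_area, prev_time; /* last busy+idle cycle */
   static double cur_area, cur_time;   /* current cycle so far */
   static int qlen;                    /* instantaneous queue length */

   /* Called whenever the queue length changes; dt is the time spent
    * at the old length. */
   void queue_changed(double dt, int new_qlen)
   {
       cur_area += qlen * dt;
       cur_time += dt;
       qlen = new_qlen;
   }

   /* Called when a busy+idle cycle completes. */
   void cycle_boundary(void)
   {
       prev_area = cur_area;
       prev_time = cur_time;
       cur_area = cur_time = 0;
   }

   /* Called on each packet arrival: set the congestion bit if the
    * average queue length over the interval is at least 1. (Assumes
    * some time has elapsed, so the divisor is nonzero.) */
   int decbit_should_set(void)
   {
       double avg = (prev_area + cur_area) / (prev_time + cur_time);
       return avg >= 1.0;
   }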
69 | 70 | Now turning our attention to the host half of the mechanism, the source 71 | records how many of its packets resulted in some router setting the 72 | congestion bit. In particular, the source maintains a congestion window, 73 | just as in TCP, and watches to see what fraction of the last window’s 74 | worth of packets resulted in the bit being set. If less than 50% of the 75 | packets had the bit set, then the source increases its congestion window 76 | by one packet. If 50% or more of the last window’s worth of packets had 77 | the congestion bit set, then the source decreases its congestion window 78 | to 0.875 times the previous value. The value 50% was chosen as the 79 | threshold based on analysis that showed it to correspond to the peak of 80 | the power curve. The “increase by 1, decrease by 0.875” rule was 81 | selected because additive increase/multiplicative decrease makes the 82 | mechanism stable. 83 | 84 | 6.2 Random Early Detection 85 | -------------------------- 86 | 87 | A second mechanism, called *random early detection* (RED), is similar to 88 | the DECbit scheme in that each router is programmed to monitor its own 89 | queue length and, when it detects that congestion is imminent, to notify 90 | the source to adjust its congestion window. RED, invented by Sally Floyd 91 | and Van Jacobson in the early 1990s, differs from the DECbit scheme in 92 | two major ways. 93 | 94 | .. _reading_red: 95 | .. admonition:: Further Reading 96 | 97 | S. Floyd and V. Jacobson. Random Early Detection (RED) 98 | Gateways for Congestion Avoidance. 99 | IEEE/ACM Transactions on Networking, August 1993. 100 | 101 | The first is that rather than explicitly sending a congestion 102 | notification message to the source, RED is most commonly implemented 103 | such that it *implicitly* notifies the source of congestion by dropping 104 | one of its packets. The source is, therefore, effectively notified by 105 | the subsequent timeout or duplicate ACK. In case you haven’t already 106 | guessed, RED is designed to be used in conjunction with TCP, which 107 | currently detects congestion by means of timeouts (or some other means 108 | of detecting packet loss such as duplicate ACKs). As the “early” part of 109 | the RED acronym suggests, the gateway drops the packet earlier than it 110 | would have to, so as to notify the source that it should decrease its 111 | congestion window sooner than it would normally have. In other words, 112 | the router drops a few packets before it has exhausted its buffer space 113 | completely, so as to cause the source to slow down, with the hope that 114 | this will mean it does not have to drop lots of packets later on. 115 | 116 | The second difference between RED and DECbit is in the details of how 117 | RED decides when to drop a packet and what packet it decides to drop. To 118 | understand the basic idea, consider a simple FIFO queue. Rather than 119 | wait for the queue to become completely full and then be forced to drop 120 | each arriving packet (the tail drop policy described in Section 2.1.3), we 121 | could decide to drop each arriving packet with some *drop probability* 122 | whenever the queue length exceeds some *drop level*. This idea is called 123 | *early random drop*. The RED algorithm defines the details of how to 124 | monitor the queue length and when to drop a packet. 125 | 126 | In the following paragraphs, we describe the RED algorithm as originally 127 | proposed by Floyd and Jacobson.
We note that several modifications have 128 | since been proposed both by the inventors and by other researchers. 129 | However, the key ideas are the same as those presented below, and most 130 | current implementations are close to the algorithm that follows. 131 | 132 | First, RED computes an average queue length using a weighted running 133 | average similar to the one used in the original TCP timeout computation. 134 | That is, ``AvgLen`` is computed as 135 | 136 | .. math:: \mathsf{AvgLen = (1 - Weight)\ x\ AvgLen + Weight\ x\ SampleLen} 137 | 138 | where 0 < ``Weight`` < 1 and ``SampleLen`` is the length of the queue 139 | when a sample measurement is made. In most software implementations, the 140 | queue length is measured every time a new packet arrives at the gateway. 141 | In hardware, it might be calculated at some fixed sampling interval. 142 | 143 | The reason for using an average queue length rather than an 144 | instantaneous one is that it more accurately captures the notion of 145 | congestion. Because of the bursty nature of Internet traffic, queues 146 | can become full very quickly and then become empty again. If a queue 147 | is spending most of its time empty, then it’s probably not appropriate 148 | to conclude that the router is congested and to tell the hosts to slow 149 | down. Thus, the weighted running average calculation tries to detect 150 | long-lived congestion, as indicated in the right-hand portion of 151 | :numref:`Figure %s <fig-red-avg>`, by filtering out short-term changes 152 | in the queue length. You can think of the running average as a 153 | low-pass filter, where ``Weight`` determines the time constant of the 154 | filter. The question of how we pick this time constant is discussed 155 | below. 156 | 157 | .. _fig-red-avg: 158 | .. figure:: figures/f06-15-9780123850591.png 159 | :width: 500px 160 | :align: center 161 | 162 | Weighted running average queue length. 163 | 164 | Second, RED has two queue length thresholds that trigger certain 165 | activity: ``MinThreshold`` and ``MaxThreshold``. When a packet arrives 166 | at the gateway, RED compares the current ``AvgLen`` with these two 167 | thresholds, according to the following rules: 168 | 169 | .. literalinclude:: code/red.c 170 | 171 | If the average queue length is smaller than the lower threshold, no 172 | action is taken, and if the average queue length is larger than the 173 | upper threshold, then the packet is always dropped. If the average 174 | queue length is between the two thresholds, then the newly arriving 175 | packet is dropped with some probability ``P``. This situation is 176 | depicted in :numref:`Figure %s <fig-red>`. The approximate 177 | relationship between ``P`` and ``AvgLen`` is shown in :numref:`Figure 178 | %s <fig-red-prob>`. Note that the probability of drop increases slowly 179 | when ``AvgLen`` is between the two thresholds, reaching ``MaxP`` at 180 | the upper threshold, at which point it jumps to unity. The rationale 181 | behind this is that, if ``AvgLen`` reaches the upper threshold, then 182 | the gentle approach (dropping a few packets) is not working and 183 | drastic measures are called for: dropping all arriving packets. Some 184 | research has suggested that a smoother transition from random dropping 185 | to complete dropping, rather than the discontinuous approach shown 186 | here, may be appropriate. 187 | 188 | .. _fig-red: 189 | .. figure:: figures/f06-16-9780123850591.png 190 | :width: 300px 191 | :align: center 192 | 193 | RED thresholds on a FIFO queue. 194 | 195 | ..
_fig-red-prob: 196 | .. figure:: figures/f06-17-9780123850591.png 197 | :width: 400px 198 | :align: center 199 | 200 | Drop probability function for RED. 201 | 202 | Although :numref:`Figure %s <fig-red-prob>` shows the probability of 203 | drop as a function only of ``AvgLen``, the situation is actually a 204 | little more complicated. In fact, ``P`` is a function of both 205 | ``AvgLen`` and how long it has been since the last packet was 206 | dropped. Specifically, it is computed as follows: 207 | 208 | .. math:: \mathsf{TempP = MaxP\ x\ (AvgLen - MinThreshold)\ /\ (MaxThreshold - MinThreshold)} 209 | 210 | .. math:: \mathsf{P = TempP\ /\ (1 - count\ x\ TempP)} 211 | 212 | ``TempP`` is the variable that is plotted on the y-axis in :numref:`Figure 213 | %s <fig-red-prob>`, and ``count`` keeps track of how many newly arriving 214 | packets have been queued (not dropped) while ``AvgLen`` has been between 215 | the two thresholds. ``P`` increases slowly as ``count`` increases, 216 | thereby making a drop increasingly likely as the time since the last 217 | drop increases. This makes closely spaced drops relatively less likely 218 | than widely spaced drops. This extra step in calculating ``P`` was 219 | introduced by the inventors of RED when they observed that, without it, 220 | the packet drops were not well distributed in time but instead tended to 221 | occur in clusters. Because packet arrivals from a certain connection are 222 | likely to arrive in bursts, this clustering of drops is likely to cause 223 | multiple drops in a single connection. This is not desirable, since only 224 | one drop per round-trip time is enough to cause a connection to reduce 225 | its window size, whereas multiple drops might send it back into slow 226 | start. 227 | 228 | As an example, suppose that we set ``MaxP`` to 0.02 and ``count`` is 229 | initialized to zero. If the average queue length were halfway between 230 | the two thresholds, then ``TempP``, and the initial value of ``P``, 231 | would be half of ``MaxP``, or 0.01. An arriving packet, of course, has a 232 | 99 in 100 chance of getting into the queue at this point. With each 233 | successive packet that is not dropped, ``P`` slowly increases, and by 234 | the time 50 packets have arrived without a drop, ``P`` would have 235 | doubled to 0.02. In the unlikely event that 99 packets arrived without 236 | loss, ``P`` reaches 1, guaranteeing that the next packet is dropped. The 237 | important thing about this part of the algorithm is that it ensures a 238 | roughly even distribution of drops over time. 239 | 240 | The intent is that, if RED drops a small percentage of packets when 241 | ``AvgLen`` exceeds ``MinThreshold``, this will cause a few TCP 242 | connections to reduce their window sizes, which in turn will reduce the 243 | rate at which packets arrive at the router. All going well, ``AvgLen`` 244 | will then decrease and congestion is avoided. The queue length can be 245 | kept short, while throughput remains high since few packets are dropped. 246 | 247 | Note that, because RED is operating on a queue length averaged over 248 | time, it is possible for the instantaneous queue length to be much 249 | longer than ``AvgLen``. In this case, if a packet arrives and there is 250 | nowhere to put it, then it will have to be dropped. When this happens, 251 | RED is operating in tail drop mode. One of the goals of RED is to 252 | prevent tail drop behavior if possible.
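Pulling these pieces together, the decision RED makes on each arriving
packet can be summarized in a few lines of code. The sketch below is
illustrative only (the parameter values, variable names, and the use of
``rand()`` are assumptions made for the example), and it glosses over
details found in real implementations, such as measuring the queue in
bytes rather than packets.

.. code:: c

   #include <stdlib.h>

   /* Illustrative parameter settings; deployments tune these. */
   static double Weight = 0.002;
   static double MinThreshold = 5.0;  /* in packets */
   static double MaxThreshold = 10.0; /* in packets */
   static double MaxP = 0.02;

   static double AvgLen = 0.0;
   static int count = 0; /* packets queued (not dropped) while AvgLen
                            has been between the two thresholds */

   /* Called for each arriving packet, with the sampled queue length;
    * returns 1 if the packet should be dropped, 0 if queued. */
   int red_arrival(int SampleLen)
   {
       AvgLen = (1 - Weight) * AvgLen + Weight * SampleLen;

       if (AvgLen < MinThreshold) {
           count = 0;
           return 0; /* queue the packet */
       }
       if (AvgLen >= MaxThreshold) {
           count = 0;
           return 1; /* always drop */
       }

       /* Between the thresholds: drop with probability P. */
       double TempP = MaxP * (AvgLen - MinThreshold)
                           / (MaxThreshold - MinThreshold);
       double P = TempP / (1 - count * TempP);
       if ((double)rand() / RAND_MAX < P) {
           count = 0;
           return 1; /* random early drop */
       }
       count++;
       return 0;
   }

The random nature of RED confers an interesting property on the
algorithm.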
Because RED drops packets randomly, the probability that RED 256 | decides to drop a particular flow’s packet(s) is roughly proportional to 257 | the share of the bandwidth that flow is currently getting at that 258 | router. This is because a flow that is sending a relatively large number 259 | of packets is providing more candidates for random dropping. Thus, there 260 | is some sense of fair resource allocation built into RED, although it is 261 | by no means precise. While arguably fair, because RED punishes 262 | high-bandwidth flows more than low-bandwidth flows, it increases the 263 | probability of a TCP restart, which is doubly painful for those 264 | high-bandwidth flows. 265 | 266 | 267 | A fair amount of analysis has gone into setting the various RED 268 | parameters—for example, ``MaxThreshold``, ``MinThreshold``, ``MaxP`` 269 | and ``Weight``—all in the name of optimizing the power function 270 | (throughput-to-delay ratio). The performance of these parameters has 271 | also been confirmed through simulation, and the algorithm has been 272 | shown not to be overly sensitive to them. It is important to keep in 273 | mind, however, that all of this analysis and simulation hinges on a 274 | particular characterization of the network workload. The real 275 | contribution of RED is a mechanism by which the router can more 276 | accurately manage its queue length. Defining precisely what 277 | constitutes an optimal queue length depends on the traffic mix and is 278 | a subject of ongoing study. 279 | 280 | Consider the setting of the two thresholds, ``MinThreshold`` and 281 | ``MaxThreshold``. If the traffic is fairly bursty, then ``MinThreshold`` 282 | should be sufficiently large to allow the link utilization to be 283 | maintained at an acceptably high level. Also, the difference between the 284 | two thresholds should be larger than the typical increase in the 285 | calculated average queue length in one RTT. Setting ``MaxThreshold`` to 286 | twice ``MinThreshold`` seems to be a reasonable rule of thumb given the 287 | traffic mix on today’s Internet. In addition, since we expect the 288 | average queue length to hover between the two thresholds during periods 289 | of high load, there should be enough free buffer space *above* 290 | ``MaxThreshold`` to absorb the natural bursts that occur in Internet 291 | traffic without forcing the router to enter tail drop mode. 292 | 293 | We noted above that ``Weight`` determines the time constant for the 294 | running average low-pass filter, and this gives us a clue as to how we 295 | might pick a suitable value for it. Recall that RED is trying to send 296 | signals to TCP flows by dropping packets during times of congestion. 297 | Suppose that a router drops a packet from some TCP connection and then 298 | immediately forwards some more packets from the same connection. When 299 | those packets arrive at the receiver, it starts sending duplicate ACKs 300 | to the sender. When the sender sees enough duplicate ACKs, it will 301 | reduce its window size. So, from the time the router drops a packet 302 | until the time when the same router starts to see some relief from the 303 | affected connection in terms of a reduced window size, at least one 304 | round-trip time must elapse for that connection. There is probably not 305 | much point in having the router respond to congestion on time scales 306 | much less than the round-trip time of the connections passing through 307 | it. 
As noted previously, 100 ms is not a bad estimate of average 308 | round-trip times in the Internet. Thus, ``Weight`` should be chosen such 309 | that changes in queue length over time scales much less than 100 ms are 310 | filtered out. 311 | 312 | Since RED works by sending signals to TCP flows to tell them to slow 313 | down, you might wonder what would happen if those signals are ignored. 314 | This is often called the *unresponsive flow* problem. Unresponsive 315 | flows use more than their fair share of network resources and could 316 | cause congestive collapse if there were enough of them, just as in the 317 | days before TCP congestion control. Some queueing techniques, such as 318 | weighted fair queueing, could help with this problem by isolating 319 | certain classes of traffic from others. There was also discussion of 320 | creating a variant of RED that could drop more heavily from flows that 321 | are unresponsive to the initial hints that it sends. However, this 322 | turns out to be challenging because it can be hard to distinguish 323 | between non-responsive behavior and \"correct\" behavior, especially 324 | when flows have a wide variety of different RTTs and bottleneck bandwidths. 325 | 326 | As a footnote, 15 prominent network researchers urged the 327 | widespread adoption of RED-inspired AQM in 1998. The recommendation 328 | was largely ignored, for reasons that we touch on below. AQM 329 | approaches based on RED have, however, been applied with some success 330 | in datacenters. 331 | 332 | .. _reading_rfc: 333 | .. admonition:: Further Reading 334 | 335 | R. Braden, *et al*. 336 | `Recommendations on Queue Management and Congestion Avoidance in the Internet 337 | <https://datatracker.ietf.org/doc/html/rfc2309>`__. 338 | RFC 2309, April 1998. 339 | 340 | 341 | 6.3 Controlled Delay 342 | -------------------- 343 | 344 | As noted in the preceding section, RED has never been widely 345 | adopted. Certainly it never reached the level necessary to have a 346 | significant impact on congestion in the Internet. One reason 347 | is that RED is difficult to configure in a 348 | way that consistently improves performance. Note the large number 349 | of parameters that affect its operation (``MinThreshold``, 350 | ``MaxThreshold``, ``MaxP``, and ``Weight``). There is enough research 351 | showing that RED produces a wide range of outcomes (not all of 352 | them helpful) depending on the type of traffic and parameter settings. 353 | This created uncertainty around the merits of deploying it. 354 | 355 | Over a period of years, Van Jacobson (well known for his work on TCP 356 | congestion control and a co-author of the original RED paper) collaborated 357 | with Kathy Nichols and eventually other researchers to come up with an 358 | AQM approach that improves upon RED. This work became known as CoDel 359 | (pronounced *coddle*) for Controlled Delay AQM. CoDel builds on several 360 | key insights that emerged over decades of experience with TCP and 361 | AQM. 362 | 363 | .. _reading_codel: 364 | .. admonition:: Further Reading 365 | 366 | K. Nichols and V. Jacobson. 367 | Controlling Queue Delay. 368 | ACM Queue, 10(5), May 2012. 369 | 370 | First, queues are an important aspect of networking and it is expected 371 | that queues will build up from time to time. For example, a newly opened 372 | connection may dump a window's worth of packets into the network, and 373 | these are likely to form a queue at the bottleneck link. This is not 374 | in itself a problem.
There should be enough buffer capacity to 376 | absorb such bursts. Problems arise when there is not enough buffer 377 | capacity to absorb bursts, leading to excessive loss. This came to be 378 | understood in the 1990s as a requirement that buffers be able to hold 379 | at least one bandwidth-delay product of packets—a requirement that 380 | was probably too large and subsequently questioned by further 381 | research. But the fact is that buffers are necessary, and it is 382 | expected that they will be used to absorb bursts. The CoDel authors 383 | refer to this as \"good queue\", as illustrated in :numref:`Figure 384 | %s <fig-good-bad>` (a). 385 | 386 | .. _fig-good-bad: 387 | .. figure:: figures/Slide14.png 388 | :width: 400px 389 | :align: center 390 | 391 | Good and Bad Queue Scenarios 392 | 393 | Queues become a problem when they are persistently full. A 394 | persistently full queue is doing nothing except adding delay to the 395 | network, and it is also less able to absorb bursts if it never drains 396 | fully. The combination of large buffers and persistent queues within 397 | those buffers is a phenomenon that Jim Gettys has named 398 | *Bufferbloat*. It is clear that persistently full queues are what a 399 | well-designed AQM mechanism would seek to avoid. Queues that stay full 400 | for long periods without draining are referred to, unsurprisingly, as 401 | \"bad queue\", as shown in :numref:`Figure %s <fig-good-bad>` (b). 402 | 403 | .. _reading_bloat: 404 | .. admonition:: Further Reading 405 | 406 | J. Gettys. Bufferbloat: Dark Buffers in the Internet. IEEE 408 | Internet Computing, April 2011. 409 | 410 | In a sense, then, the challenge for an AQM algorithm is to distinguish 411 | between \"good\" and \"bad\" queues, and to trigger packet loss only when 412 | the queue is determined to be \"bad\". Indeed, this is what RED is 413 | trying to do with its ``Weight`` parameter (which filters out 414 | transient queue length). 415 | 416 | One of the innovations of CoDel is to focus on *sojourn time*: the 417 | time that any given packet waits in the queue. Sojourn time is 418 | independent of the bandwidth of a link and provides a useful indication 419 | of congestion even on links whose bandwidth varies over time, such as 420 | wireless links. A queue that is behaving well will frequently drain to 421 | zero, and thus, some packets will experience a sojourn time close to 422 | zero, as in :numref:`Figure %s <fig-good-bad>` (a). Conversely, a 423 | congested queue will delay every packet, and the minimum sojourn time 424 | will never be close to zero, as seen in :numref:`Figure %s 425 | <fig-good-bad>` (b). CoDel therefore measures the sojourn 426 | time—something that is easy to do for every packet—and tracks whether 427 | it is consistently sitting above some small target, as sketched 428 | below. \"Consistently\" is defined as \"lasting longer than a typical RTT\".
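The core of that tracking logic is small enough to sketch directly. The
following is a simplified illustration, not the actual CoDel
implementation: it assumes two constants, ``target`` and ``interval``
(whose default values are discussed next), and it omits the control
law, described below, that spaces out successive drops.

.. code:: c

   #include <stdbool.h>

   static double target = 0.005;   /* seconds */
   static double interval = 0.100; /* seconds, roughly one RTT */

   /* Deadline by which the sojourn time must come back below target;
    * zero means the queue is not currently above target. */
   static double first_above = 0;

   /* Called as each packet is dequeued; sojourn is the time this
    * packet spent in the queue, now is the current time. Returns
    * true once the queue has stayed above target for an interval. */
   bool codel_above_target(double sojourn, double now)
   {
       if (sojourn < target) {
           first_above = 0; /* queue drained: "good queue" */
           return false;
       }
       if (first_above == 0) {
           first_above = now + interval;
           return false;
       }
       return now >= first_above; /* "bad queue": time to act */
   }

Rather than asking operators to determine the parameters to make
CoDel work well, the algorithm chooses reasonable defaults. A target
sojourn time of 5ms is used, along with a sliding measurement window
of 100ms. The intuition, as with RED, is that 100ms is a typical RTT
for traffic traversing the Internet, and that if congestion is lasting
longer than 100ms, we may be moving into the \"bad queue\" region. So
CoDel monitors the sojourn time relative to the target of 5ms.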
If it 437 | is above target for more than 100ms, it is time to start taking action 438 | to reduce the queue via drops (or marking if explicit congestion 439 | notification, described below, is available). 5ms is chosen as being 440 | close to zero (for better delay) but not so small that the queue would 441 | run empty. It should be noted that a great deal of experimentation and 442 | simulation has gone into these numerical choices, but more importantly, the 443 | algorithm does not seem to be overly sensitive to them. 444 | 445 | To summarize, CoDel largely ignores queues that last less than an RTT, 446 | but starts taking action as soon as a queue persists for more than 447 | an RTT. By making reasonable assumptions about Internet RTTs, the algorithm 448 | requires no configuration parameters. 449 | 450 | An additional subtlety is that CoDel drops a slowly increasing percentage of 451 | traffic as long as the observed sojourn time remains above the target. As 452 | discussed further in Section 7.4, TCP throughput has been shown to 453 | depend inversely on the square root of loss rate. Thus, as long as the 454 | sojourn time stays above the target, CoDel steadily 455 | increases its drop rate in proportion to the square root 456 | of the number of drops since the target was exceeded. The effect of 457 | this, in theory, is to cause a linear decrease in throughput of the 458 | affected TCP connections. Eventually this should lead to enough 459 | reduction in arriving traffic to allow the queue to drain, bringing 460 | the sojourn time back below the target. 461 | 462 | .. _fig-codel: 463 | .. figure:: figures/Slide16.png 464 | :width: 500px 465 | :align: center 466 | 467 | Home routers can suffer from bufferbloat, a situation CoDel is 468 | well-suited to address. 469 | 470 | There are more details to CoDel presented in the Nichols and Jacobson 471 | paper, including extensive simulations to indicate its effectiveness 472 | across a wide range of scenarios. The algorithm has been standardized 473 | as \"experimental\" by the IETF in RFC 8289. It is also implemented in 474 | the Linux kernel, which has aided in its deployment. In particular, 475 | CoDel provides value in home routers (which are often Linux-based), a 476 | point along the end-to-end path (see :numref:`Figure %s <fig-codel>`) 477 | that commonly experiences bufferbloat. 478 | 479 | 480 | 6.4 Explicit Congestion Notification 481 | ------------------------------------ 482 | 483 | While TCP's congestion control mechanism was initially based on packet 484 | loss as the primary congestion signal, it has long been recognized 485 | that TCP could do a better job if routers were to send a more explicit 486 | congestion signal. That is, instead of *dropping* a packet and assuming TCP will eventually 487 | notice (e.g., due to the arrival of a duplicate ACK), any AQM 488 | algorithm can potentially do a better job if it instead *marks* the 489 | packet and continues to send it along its way to the destination. This 490 | idea was codified in changes to the IP and TCP headers known as 491 | *Explicit Congestion Notification* (ECN), as specified in RFC 3168. 492 | 493 | .. _reading_ecn: 494 | .. admonition:: Further Reading 495 | 496 | K. Ramakrishnan, S. Floyd, and D. Black. 497 | `The Addition of Explicit Congestion Notification (ECN) to IP 498 | <https://datatracker.ietf.org/doc/html/rfc3168>`__. 499 | RFC 3168, September 2001. 500 | 501 | Specifically, this feedback is implemented by treating two bits in the 502 | IP ``TOS`` field as ECN bits. One bit is set by the source to indicate 503 | that it is ECN-capable, that is, able to react to a congestion 504 | notification. This is called the ``ECT`` bit (ECN-Capable Transport). 505 | The other bit is set by routers along the end-to-end path when 506 | congestion is encountered, as computed by whatever AQM algorithm it is 507 | running. This is called the ``CE`` bit (Congestion Encountered).
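To make the header manipulation concrete, the sketch below shows how a
router's AQM code might test and set these bits in the ``TOS`` byte.
The function is invented for this example; only the codepoint layout is
standard, and note that RFC 3168 actually interprets the two bits
together as a codepoint rather than as independent flags, which is the
interpretation the sketch follows.

.. code:: c

   #include <stdint.h>

   #define ECN_MASK    0x3 /* low two bits of the IP TOS byte */
   #define ECN_NOT_ECT 0x0 /* transport is not ECN-capable */
   #define ECN_ECT1    0x1 /* ECN-capable transport (1) */
   #define ECN_ECT0    0x2 /* ECN-capable transport (0) */
   #define ECN_CE      0x3 /* congestion encountered */

   /* Called when the AQM algorithm decides to signal congestion.
    * Returns 1 if the packet was marked; 0 if it must be dropped
    * instead, because the source is not ECN-capable. */
   int ecn_mark(uint8_t *tos)
   {
       if ((*tos & ECN_MASK) == ECN_NOT_ECT)
           return 0; /* fall back to dropping the packet */
       *tos = (*tos & ~ECN_MASK) | ECN_CE; /* mark rather than drop */
       return 1;
   }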
500 | 
501 | Specifically, this feedback is implemented by treating two bits in the
502 | IP ``TOS`` field as ECN bits. One bit is set by the source to indicate
503 | that it is ECN-capable, that is, able to react to a congestion
504 | notification. This is called the ``ECT`` bit (ECN-Capable Transport).
505 | The other bit is set by routers along the end-to-end path when
506 | congestion is encountered, as computed by whatever AQM algorithm they are
507 | running. This is called the ``CE`` bit (Congestion Encountered).
508 | 
509 | In addition to these two bits in the IP header (which are
510 | transport-agnostic), ECN also includes the addition of two optional
511 | flags to the TCP header. The first, ``ECE`` (ECN-Echo), communicates
512 | from the receiver to the sender that it has received a packet with the
513 | ``CE`` bit set. The second, ``CWR`` (Congestion Window Reduced),
514 | communicates from the sender to the receiver that it has reduced the
515 | congestion window.
516 | 
517 | While ECN is now the standard interpretation of two of the eight bits in
518 | the ``TOS`` field of the IP header, and support for ECN is highly
519 | recommended, it is not required. Moreover, there is no single
520 | recommended AQM algorithm; instead, there is a list of requirements
521 | that a good AQM algorithm should meet. Like TCP congestion control
522 | algorithms, every AQM algorithm has its advantages and disadvantages,
523 | and so we need a lot of them to argue about.
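
In practice, RFC 3168 treats the two bits together as four codepoints: two
of them (``ECT(0)`` and ``ECT(1)``) say "ECN-capable", one says "not
ECN-capable", and one is the congestion mark. The router-side decision is
then simple enough to sketch; the function below is our own illustrative
code (not from any particular router), showing how an AQM algorithm's
"signal congestion" decision translates into either a ``CE`` mark or a
conventional drop:

.. code-block:: c

   #include <stdint.h>

   /* The two ECN bits in the IP TOS/Traffic Class byte (RFC 3168). */
   #define ECN_MASK    0x03
   #define ECN_NOT_ECT 0x00  /* sender is not ECN-capable     */
   #define ECN_ECT1    0x01  /* ECN-capable transport, ECT(1) */
   #define ECN_ECT0    0x02  /* ECN-capable transport, ECT(0) */
   #define ECN_CE      0x03  /* congestion experienced        */

   /* Called when the AQM algorithm (RED, CoDel, ...) has decided this
    * packet should signal congestion: mark it if the flow is
    * ECN-capable, otherwise fall back to dropping it. Returns 1 if
    * the packet survives and should be forwarded. */
   int signal_congestion(uint8_t *tos)
   {
       uint8_t ecn = *tos & ECN_MASK;

       if (ecn == ECN_NOT_ECT)
           return 0;                        /* not ECN-capable: drop */

       *tos = (*tos & ~ECN_MASK) | ECN_CE;  /* set CE and forward    */
       return 1;
   }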
524 | 
525 | 
526 | 6.5 Ingress/Egress Queues
527 | -------------------------
528 | 
529 | We have been drawing a clear line between approaches to congestion
530 | control that happen *inside the network* (i.e., the AQM algorithms
531 | described in this chapter) and *at the edge of the network* (i.e., the
532 | TCP-based algorithms described in earlier chapters). But the line
533 | isn’t necessarily that crisp. To see this, you just have to think of
534 | the end-to-end path as having an *ingress queue* at the kernel/device
535 | interface on the sending host and an *egress queue* at the
536 | device/kernel interface on the receiving host.\ [#]_ These edge queues
537 | are likely to become increasingly important as virtual switches and
538 | NIC support for virtualization become more and more common.
539 | 
540 | .. [#]
541 |    Confusingly, the *ingress queue* from the perspective of the
542 |    network path is the outbound (egress) queue on the sending host,
543 |    and the *egress queue* from the perspective of the network
544 |    path is the inbound (ingress) queue on the receiving host. As
545 |    shown in :numref:`Figure %s <fig-ingress_egress>`, we use the
546 |    terms ingress and egress from the network's perspective.
547 | 
548 | This perspective is illustrated in :numref:`Figure %s
549 | <fig-ingress_egress>`, where both locations sit below TCP and provide
550 | an opportunity to inject a second piece of congestion control logic
551 | into the end-to-end path. CoDel and ECN are examples of this idea: they
552 | have been implemented at the device queue level of the Linux kernel.
553 | 
554 | .. _fig-ingress_egress:
555 | .. figure:: figures/Slide15.png
556 |    :width: 500px
557 |    :align: center
558 | 
559 |    Ingress and egress queues along the end-to-end path, implemented in
560 |    the sending and receiving hosts, respectively.
561 | 
562 | Does this work? One issue is whether packets are dropped at the ingress
563 | or the egress. When a packet is dropped at the ingress (on the sending host),
564 | TCP is notified in the return value of the *Write* function, which
565 | causes it to "forget" that it sent the packet. This means the packet
566 | will simply be sent again, although TCP does decrease its congestion window in
567 | response to the failed write. In contrast, when packets are dropped at the
568 | egress queue (on the receiving host), the TCP sender will not
569 | know to retransmit the packet until it detects the loss using one of
570 | its standard mechanisms (e.g., three duplicate ACKs or a timeout). Of
571 | course, having the egress implement ECN helps.
572 | 
573 | When we consider this discussion in the context of the bigger
574 | congestion control picture, we can make
575 | two interesting observations. One is that Linux provides a convenient
576 | and safe way to inject new code—including congestion control
577 | logic—into the kernel, namely, the *extended Berkeley Packet
578 | Filter (eBPF)*. eBPF is becoming an important technology in many other
579 | contexts as well. The standard kernel API for congestion control has
580 | been ported to eBPF, and most existing congestion control algorithms
581 | have been ported to this framework. This simplifies the task of
582 | experimenting with new algorithms or tweaking existing ones by
583 | side-stepping the hurdle of waiting for the relevant Linux kernel to
584 | be deployed.
585 | 
586 | .. _reading_bpf:
587 | .. admonition:: Further Reading
588 | 
589 |    The Linux Kernel.
590 |    `BPF Documentation
591 |    `__.
592 | 
593 | A second observation is that by explicitly exposing the ingress/egress
594 | queues to the decision-making process, we open the door to building a
595 | congestion control mechanism that contains both a “decide when to
596 | transmit a packet” component and a “decide to queue-or-drop a packet”
597 | component. We’ll see an example of a mechanism that takes an innovative
598 | approach to using these two components in Section 7.1 when we describe
599 | On-Ramp.
600 | 
601 | 
602 | 
603 | 
--------------------------------------------------------------------------------
/authors.rst:
--------------------------------------------------------------------------------
1 | About The Authors
2 | ==================
3 | 
4 | **Larry Peterson** is the Robert E. Kahn Professor of Computer
5 | Science, Emeritus at Princeton University, where he served as Chair
6 | from 2003-2009. His research focuses on the design, implementation,
7 | and operation of Internet-scale distributed systems, including the
8 | widely used PlanetLab and MeasurementLab platforms. He is currently
9 | contributing to the Aether access-edge cloud project at the Open
10 | Networking Foundation (ONF), where he serves as Chief Scientist.
11 | Peterson is a member of the National Academy of Engineering, a Fellow
12 | of the ACM and the IEEE, the 2010 recipient of the IEEE Kobayashi
13 | Computer and Communication Award, and the 2013 recipient of the ACM
14 | SIGCOMM Award. He received his Ph.D. degree from Purdue University.
15 | 
16 | **Lawrence Brakmo** currently works in the Kernel group at Facebook.
17 | Prior to joining Facebook, he was a member of the Host Networking
18 | group at Google, and before that, a researcher and project manager of
19 | the OS group at DoCoMo USA Labs. Brakmo has worked on TCP enhancements
20 | to improve network performance, including the design of the TCP Vegas
21 | and TCP-NV congestion control algorithms. He has also developed OS
22 | techniques to improve system reliability, performance, and energy
23 | consumption. Brakmo received his Ph.D. degree in Computer Science from
24 | The University of Arizona.
25 | 
26 | **Bruce Davie** is a computer scientist noted for his contributions to
27 | the field of networking. He is a former VP and CTO for the Asia
28 | Pacific region at VMware. He joined VMware during the acquisition of
29 | Software Defined Networking (SDN) startup Nicira. Prior to that, he
30 | was a Fellow at Cisco Systems, leading a team of architects
31 | responsible for Multiprotocol Label Switching (MPLS). Davie has over
32 | 30 years of networking industry experience and has co-authored 17
33 | RFCs. He was recognized as an ACM Fellow in 2009 and chaired ACM
34 | SIGCOMM from 2009 to 2013. He was also a visiting lecturer at the
35 | Massachusetts Institute of Technology for five years. Davie is the
36 | author of multiple books and the holder of more than 40 U.S. patents.
37 | 
--------------------------------------------------------------------------------
/avoidance.rst:
--------------------------------------------------------------------------------
1 | Chapter 5: Avoidance-Based Algorithms
2 | ======================================
3 | 
4 | .. include:: 
5 | 
6 | A review of the academic literature on TCP congestion control shows a
7 | notable gap between the original TCP Tahoe and Reno mechanisms
8 | introduced in 1988 and 1990, respectively, and the next major flurry
9 | of activity starting in 1994, marked by the introduction of an
10 | alternative approach known as TCP Vegas. This triggered an avalanche
11 | of comparative studies and alternative designs that would persist for
12 | the next 25+ years.
13 | 
14 | .. _reading_vegas:
15 | .. admonition:: Further Reading
16 | 
17 |    L. Brakmo, S. O'Malley, and L. Peterson.
18 |    `TCP Vegas: New Technique for Congestion Detection and Avoidance
19 |    `__.
20 |    ACM SIGCOMM '94 Symposium. August 1994. (Reprinted in IEEE/ACM Transactions
21 |    on Networking, October 1995).
22 | 
23 | Whereas every approach described to date sees packet loss as a
24 | congestion signal and tries to react to *control* congestion after the
25 | onset, TCP Vegas takes an *avoidance-based* approach to congestion: it
26 | tries to detect changes in the measured throughput rate and adjust
27 | the sending rate *before* congestion becomes severe enough to cause
28 | packet loss. This chapter describes the general "Vegas strategy",
29 | along with three example variations on that strategy introduced over
30 | time. This case study culminates in the BBR algorithm championed by
31 | Google today.
32 | 
33 | 5.1 TCP Vegas
34 | -------------
35 | 
36 | The essential idea behind TCP Vegas is to adapt the sending rate based
37 | on a comparison of the *measured* throughput rate with the *expected*
38 | throughput rate. The intuition can be seen in the trace of TCP Reno
39 | given in :numref:`Figure %s <fig-trace3>`. The top graph traces the
40 | connection’s congestion window; it shows the same information as the
41 | traces given in the previous chapter. The middle and bottom graphs
42 | depict new information: the middle graph shows the average sending
43 | rate as measured at the source, and the bottom graph shows the average
44 | queue length as measured at the bottleneck router. All three graphs
45 | are synchronized in time. In the period between 4.5 and 6.0 seconds
46 | (shaded region), the congestion window increases (top graph). We
47 | expect the observed throughput to also increase, but instead it stays
48 | flat (middle graph). This is because the throughput cannot increase
49 | beyond the available bandwidth.
Beyond this point, any increase in the
50 | window size only results in packets taking up buffer space at the
51 | bottleneck router (bottom graph).
52 | 
53 | .. _fig-trace3:
54 | .. figure:: figures/f06-18-9780123850591.png
55 |    :width: 600px
56 |    :align: center
57 | 
58 |    Congestion window versus observed throughput rate (the
59 |    three graphs are synchronized). Top, congestion window; middle,
60 |    observed throughput; bottom, buffer space taken up at the
61 |    router. Colored line = ``CongestionWindow``; solid bullet = timeout;
62 |    hash marks = time when each packet is transmitted; vertical bars =
63 |    time when a packet that was eventually retransmitted was first
64 |    transmitted.
65 | 
66 | A useful metaphor that describes the phenomenon illustrated in
67 | :numref:`Figure %s <fig-trace3>` is driving on ice. The speedometer
68 | (congestion window) may say that you are going 30 miles an hour, but
69 | by looking out the car window and seeing people pass you on foot
70 | (measured throughput rate) you know that you are going no more than 5
71 | miles an hour. The uselessly spinning wheels in this analogy are like
72 | the extra packets being sent only to sit uselessly in router buffers.
73 | 
74 | TCP Vegas uses this idea to measure and control the amount of extra data
75 | this connection has in transit, where by “extra data” we mean data that
76 | the source would not have transmitted had it been able to match
77 | exactly the available bandwidth of the network. The goal of TCP Vegas is
78 | to maintain the “right” amount of extra data in the network. Obviously,
79 | if a source is sending too much extra data, it will cause long delays
80 | and possibly lead to congestion. Less obviously, if a connection is
81 | sending too little extra data, it cannot respond rapidly enough to
82 | transient increases in the available network bandwidth. TCP Vegas’s
83 | congestion-avoidance actions are based on changes in the estimated
84 | amount of extra data in the network, not only on dropped packets. We now
85 | describe the algorithm in detail.
86 | 
87 | First, define a given flow’s ``BaseRTT`` to be the RTT of a packet when
88 | the flow is not congested. In practice, TCP Vegas sets ``BaseRTT`` to
89 | the minimum of all measured round-trip times; it is commonly the RTT of
90 | the first packet sent by the connection, before the router queues
91 | increase due to traffic generated by this flow. If we assume that we are
92 | not overflowing the connection, then the expected throughput is given by
93 | 
94 | .. math:: \mathsf{ExpectedRate = CongestionWindow\ /\ BaseRTT}
95 | 
96 | where ``CongestionWindow`` is the TCP congestion window, which we
97 | assume (for the purpose of this discussion) to be equal to the number
98 | of bytes in transit.
99 | 
100 | Second, TCP Vegas calculates the current sending rate, ``ActualRate``.
101 | This is done by recording the sending time for a distinguished packet,
102 | recording how many bytes are transmitted between the time that packet
103 | is sent and when its acknowledgment is received, computing the sample
104 | RTT for the distinguished packet when its acknowledgment arrives, and
105 | dividing the number of bytes transmitted by the sample RTT. This
106 | calculation is done once per round-trip time.
107 | 
108 | Third, TCP Vegas compares ``ActualRate`` to ``ExpectedRate`` and
109 | adjusts the window accordingly. We let ``Diff = ExpectedRate -
110 | ActualRate``.
Note that ``Diff`` is positive or 0 by definition,
111 | since the only way ``ActualRate > ExpectedRate`` is if the measured
112 | sample RTT is less than ``BaseRTT``. If that happens, we change
113 | ``BaseRTT`` to the latest sampled RTT. We also define two thresholds,
114 | :math:`\alpha` < :math:`\beta`, corresponding to having too little and too much
115 | extra data in the network, respectively. When ``Diff`` < :math:`\alpha`, TCP
116 | Vegas increases the congestion window linearly during the next RTT,
117 | and when ``Diff`` > :math:`\beta`, TCP Vegas decreases the congestion window
118 | linearly during the next RTT. TCP Vegas leaves the congestion window
119 | unchanged when :math:`\alpha` < ``Diff`` < :math:`\beta`.
120 | 
121 | Intuitively, we can see that the farther away the actual throughput
122 | gets from the expected throughput, the more congestion there is in the
123 | network, which implies that the sending rate should be reduced. The
124 | :math:`\beta` threshold triggers this decrease. On the other hand, when the
125 | actual throughput rate gets too close to the expected throughput, the
126 | connection is in danger of not utilizing the available bandwidth. The
127 | :math:`\alpha` threshold triggers this increase. The overall goal is to keep
128 | between :math:`\alpha` and :math:`\beta` extra bytes in the network.
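
Pulling the three steps together, the per-RTT update amounts to only a few
lines of code. The sketch below is ours, not taken from the Vegas
implementation; the state structure is illustrative, and the thresholds are
expressed in KBps, matching the units used in this discussion.

.. code-block:: c

   #include <stdint.h>

   #define ALPHA 30  /* KBps: too little extra data in the network */
   #define BETA  60  /* KBps: too much extra data in the network   */

   struct vegas_state {
       double   base_rtt;    /* minimum RTT observed, in seconds */
       double   sample_rtt;  /* RTT of the distinguished packet  */
       uint32_t cwnd;        /* congestion window, in KB         */
       uint32_t maxseg;      /* maximum segment size, in KB      */
   };

   /* Once per RTT: compare expected and actual rates and nudge the
    * congestion window in the appropriate direction. */
   void vegas_update(struct vegas_state *s, double actual_rate /* KBps */)
   {
       if (s->sample_rtt < s->base_rtt)
           s->base_rtt = s->sample_rtt;  /* keep BaseRTT a true minimum */

       double expected_rate = s->cwnd / s->base_rtt;  /* KBps   */
       double diff = expected_rate - actual_rate;     /* >= 0   */

       if (diff < ALPHA)
           s->cwnd += s->maxseg;   /* linear increase over next RTT */
       else if (diff > BETA)
           s->cwnd -= s->maxseg;   /* linear decrease over next RTT */
       /* otherwise leave the window unchanged */
   }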
129 | 
130 | .. _fig-vegas:
131 | .. figure:: figures/f06-19-9780123850591.png
132 |    :width: 600px
133 |    :align: center
134 | 
135 |    Trace of TCP Vegas congestion-avoidance mechanism.
136 |    Top, congestion window; bottom, expected (colored line) and actual
137 |    (black line) throughput. The shaded area is the region between the
138 |    :math:`\alpha` and :math:`\beta` thresholds.
139 | 
140 | :numref:`Figure %s <fig-vegas>` traces the TCP Vegas
141 | congestion-avoidance algorithm. The top graph traces the congestion
142 | window, showing the same information as the other traces given
143 | throughout this chapter. The bottom graph traces the expected and
144 | actual throughput rates that govern how the congestion window is
145 | set. It is this bottom graph that best illustrates how the algorithm
146 | works. The colored line tracks the ``ExpectedRate``, while the black
147 | line tracks the ``ActualRate``. The wide shaded strip gives the region
148 | between the :math:`\alpha` and :math:`\beta` thresholds; the top of the shaded strip is
149 | :math:`\alpha` KBps away from ``ExpectedRate``, and the bottom of the shaded
150 | strip is :math:`\beta` KBps away from ``ExpectedRate``. The goal is to keep the
151 | ``ActualRate`` between these two thresholds, within the shaded
152 | region. Whenever ``ActualRate`` falls below the shaded region (i.e.,
153 | gets too far from ``ExpectedRate``), TCP Vegas decreases the
154 | congestion window because it fears that too many packets are being
155 | buffered in the network. Likewise, whenever ``ActualRate`` goes above
156 | the shaded region (i.e., gets too close to the ``ExpectedRate``), TCP
157 | Vegas increases the congestion window because it fears that it is
158 | underutilizing the network.
159 | 
160 | Because the algorithm, as just presented, compares the difference
161 | between the actual and expected throughput rates to the :math:`\alpha` and :math:`\beta`
162 | thresholds, these two thresholds are defined in terms of KBps. However,
163 | it is perhaps more accurate to think in terms of how many extra
164 | *packet buffers* the connection is occupying in the network. For example, on a
165 | connection with a ``BaseRTT`` of 100 ms and a packet size of 1 KB, if
166 | :math:`\alpha` = 30 KBps and :math:`\beta` = 60 KBps, then we can think of :math:`\alpha` as specifying
167 | that the connection needs to be occupying at least 3 extra buffers in
168 | the network and :math:`\beta` as specifying that the connection should occupy no
169 | more than 6 extra buffers in the network. This setting of :math:`\alpha`
170 | and :math:`\beta` worked well in practice when Vegas was first deployed, but
171 | as we'll see in the next section, these parameters continue to be tuned
172 | for changing circumstances.
173 | 
174 | Finally, you will notice that TCP Vegas decreases the congestion window
175 | linearly, seemingly in conflict with the rule that multiplicative
176 | decrease is needed to ensure stability. The explanation is that TCP
177 | Vegas does use multiplicative decrease when a timeout occurs; the linear
178 | decrease just described is an *early* decrease in the congestion window
179 | that should happen before congestion occurs and packets start being
180 | dropped.
181 | 
182 | 5.2 Varied Assumptions
183 | ----------------------
184 | 
185 | TCP Vegas—and Vegas-like approaches to avoiding congestion—have been
186 | adapted over time, often in response to different assumptions about
187 | the network. Vegas was never as widely deployed as Reno, so the
188 | modifications were often driven more by lab studies than extensive
189 | real-world experience, but they have collectively refined and
190 | contributed to our understanding of avoidance-based algorithms. We
191 | summarize some of those insights here, but return to the general topic
192 | of customizing the congestion control algorithm for specific use cases
193 | in Chapter 7.
194 | 
195 | 5.2.1 FAST TCP
196 | ~~~~~~~~~~~~~~
197 | 
198 | The first Vegas-inspired mechanism was FAST TCP, which modified Vegas
199 | to be more efficient on high-speed networks with large bandwidth-delay
200 | products. The idea was to increase the congestion window more
201 | aggressively during the phase when the algorithm is trying to find the
202 | available "in transit" bandwidth (before packets are buffered in the
203 | network), and then more conservatively as the algorithm starts to
204 | compete with other flows for buffers at the bottleneck router. FAST
205 | also recommended adjusting the value of :math:`\alpha` to roughly 30 packets.
206 | 
207 | Beyond managing congestion in networks with large bandwidth-delay
208 | products, where keeping the pipe full is a substantial challenge,
209 | there are two other items of note about FAST. First, whereas both TCP
210 | Reno and TCP Vegas were the result of a little intuition and a lot of
211 | trial-and-error, FAST was grounded in optimization theory (which was
212 | subsequently used to explain why Vegas works). Second, unlike all
213 | other congestion control algorithms of which we are aware, an
214 | implementation of FAST was made available only as a proprietary
215 | solution.
216 | 
217 | .. _reading_fast:
218 | .. admonition:: Further Reading
219 | 
220 |    S. Low, L. Peterson, and L. Wang. `Understanding TCP Vegas: A
221 |    Duality Model `__.
222 |    Journal of the ACM, Volume 49, Issue 2, March 2002.
223 | 
224 | 
225 | 5.2.2 TCP Westwood
226 | ~~~~~~~~~~~~~~~~~~
227 | 
228 | While Vegas was motivated by the idea that congestion can be detected
229 | and averted *before* a loss occurs, TCP Westwood (TCPW) is motivated
230 | primarily by the realization that packet loss is not always a reliable
231 | indicator of congestion. This is particularly noticeable with wireless
232 | links, which were a novelty at the time of Vegas but were becoming common
233 | by the time of TCPW. Wireless links often lose packets due to
234 | uncorrected errors on the wireless channel, which are unrelated to
235 | congestion. Hence, congestion needs to be detected another
236 | way. Interestingly, the end result is somewhat similar to Vegas, in
237 | that TCPW also tries to determine the bottleneck bandwidth by looking
238 | at the rate at which ACKs are coming back for those packets that were
239 | delivered successfully.
240 | 
241 | When a packet loss occurs, TCPW does not immediately cut the
242 | congestion window in half, as it does not yet know if the loss was due
243 | to congestion or a link-related packet loss. Instead, it estimates
244 | the rate at which traffic was flowing right before the packet loss
245 | occurred, and uses that estimate to set its new congestion window. This is a
246 | less aggressive form of backoff than TCP Reno's halving. If
247 | the loss was congestion-related, TCPW should send at the rate that was
248 | acceptable before the loss. And if the loss was caused by a wireless
249 | error, TCPW has not backed off so much, and will start to ramp up
250 | again to fully utilize the network. The result was a protocol that
251 | performed similarly to Reno on fixed links but outperformed it by
252 | substantial margins when lossy links were involved.
253 | 
254 | Tuning the congestion control algorithm to deal with wireless links
255 | continues to be a challenging problem, and to complicate matters, WiFi
256 | and mobile cellular networks have different properties. We return
257 | to this issue in Chapter 7.
258 | 
259 | 
260 | 5.2.3 New Vegas
261 | ~~~~~~~~~~~~~~~
262 | 
263 | Our final example is New Vegas (NV), an adaptation of Vegas's
264 | delay-based approach to datacenters, where link bandwidths are 10Gbps
265 | or higher and RTTs are typically measured in the tens of
266 | microseconds. This is an important use case that we return to in
267 | Chapter 7; our goal here is to build some intuition.
268 | 
269 | To understand the basic idea of NV, suppose that we plot ``Rate``
270 | versus ``CongestionWindow`` for every packet for which an ACK is
271 | received. For the purpose of this exercise, ``Rate`` is simply the
272 | ratio of ``CongestionWindow`` (in bytes) to the RTT of packets that
273 | have been ACKed (in seconds). Note that we use ``CongestionWindow``
274 | in this discussion for simplicity, while in practice NV uses in-flight
275 | (unacknowledged) bytes. When plotted over time, as shown in
276 | :numref:`Figure %s <fig-nv>`, we end up with vertical bars (rather
277 | than points) for values of ``CongestionWindow`` due to transient
278 | congestion or noise in the measurements.
279 | 
280 | .. _fig-nv:
281 | .. figure:: figures/Slide4.png
282 |    :width: 500px
283 |    :align: center
284 | 
285 |    Plotting measured rate vs congestion window.
286 | 
287 | The maximum slope of the top of the bars indicates the best we have
288 | been able to do in the past. In a well-tuned system, the top of the
289 | bars is bounded by a straight line going through the origin. The idea
290 | is that as long as the network is not congested, doubling the amount
291 | of data we send per RTT should double the rate.
292 | 
293 | New measurements of ``Rate`` and ``CongestionWindow`` can either fall close to the
294 | boundary line (black diamond in the figure) or below it (blue diamond in the
295 | figure). A measurement above the line causes NV to automatically
296 | update the line by increasing its slope so the measurement will fall
297 | on the new line. If the new measurement is close to the line, then NV
298 | increases ``CongestionWindow``. If the measurement is below the line, it means
299 | that we have seen equal performance in the past with a lower
300 | ``CongestionWindow``. In the example shown in :numref:`Figure %s <fig-nv>`, we see
301 | similar performance with ``CongestionWindow=12``, so we decrease ``CongestionWindow``. The
302 | decrease is done multiplicatively, rather than instantaneously, in case
303 | the new measurement is noisy. To filter out bad measurements, NV
304 | collects many measurements and then uses the best one before making a
305 | congestion determination.
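
A sketch of this line test appears below. The names, the closeness
threshold, and the decay factor are hypothetical values of our choosing, and
NV's actual measurement filtering is more involved, but the three cases
mirror the description above.

.. code-block:: c

   #include <stdint.h>

   /* NV maintains a boundary line rate = slope * cwnd through the
    * origin; measurements at or near the line are "uncongested". */
   struct nv_state {
       double   slope;  /* best observed rate/cwnd ratio so far */
       uint32_t cwnd;   /* congestion window, in packets        */
   };

   void nv_on_measurement(struct nv_state *s, double rate, uint32_t cwnd)
   {
       double expected = s->slope * cwnd;

       if (rate > expected) {
           /* Better than anything seen before: steepen the line. */
           s->slope = rate / cwnd;
           s->cwnd++;
       } else if (rate >= 0.9 * expected) {
           /* Close to the line: not congested, keep growing. */
           s->cwnd++;
       } else {
           /* The same rate was achieved with a smaller window in the
            * past: back off multiplicatively toward that window. */
           uint32_t target = (uint32_t)(rate / s->slope);
           s->cwnd -= (s->cwnd - target) / 8;  /* gradual, noise-tolerant */
       }
   }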
306 | 
307 | 
308 | 5.3 TCP BBR
309 | ---------------
310 | 
311 | BBR (Bottleneck Bandwidth and RTT) is a new TCP congestion control
312 | algorithm developed by researchers at Google. Like Vegas, BBR is delay-based,
313 | which means it tries to detect buffer growth so as to avoid
314 | congestion and packet loss. Both BBR and Vegas use the minimum RTT and
315 | the observed bottleneck bandwidth, as calculated over some time
316 | interval, as their main control signals.
317 | 
318 | .. _fig-bbr:
319 | .. figure:: figures/Slide5.png
320 |    :width: 500px
321 |    :align: center
322 | 
323 |    Determining the optimal sending rate based on observed throughput
324 |    and RTT.
325 | 
326 | :numref:`Figure %s <fig-bbr>` shows the basic idea underlying
327 | BBR. Assume a network has a single bottleneck link with some available
328 | bandwidth and queuing capacity. As the congestion window opens and
329 | more data is put in flight, initially there is an increase in
330 | throughput (on the lower graph) but no increase in delay, as the
331 | bottleneck is not full. Then, once the data rate reaches the bottleneck
332 | bandwidth, a queue starts to build. At this point, RTT rises, and no
333 | rise in throughput is observed. This is the beginning of the
334 | congestion phase. This graph is really a simplified version of what we
335 | see in the 4.5 to 6.0 second timeframe in :numref:`Figure %s
336 | <fig-trace3>`.
337 | 
338 | Like Vegas, BBR aims to accurately determine the point where the
339 | queue has just started to build, as opposed to continuing all the way
340 | to the point of filling the buffer and causing packet drops as Reno
341 | does. A lot of the work in BBR has been around improving the
342 | sensitivity of the mechanisms that locate that sweet spot. There are
343 | numerous challenges: measurements of bandwidth and delay are noisy;
344 | network conditions are not static; and there is the perennial quest for
345 | fairness when competing for bandwidth against both BBR and non-BBR
346 | flows.
347 | 
348 | One striking feature of BBR compared to the other approaches we have
349 | seen is that it does not rely solely on ``CongestionWindow`` to determine how much
350 | data is put in flight. Notably, BBR also tries to smooth out the rate
351 | at which a sender puts data into the network in an effort to avoid
352 | bursts that would lead to excessive queuing.
Under ideal conditions,
353 | we would like to send data exactly at the rate of the bottleneck, thus
354 | achieving the highest possible throughput without causing a queue to
355 | build up. Whereas most TCP variants use the arrival of an ACK to
356 | "clock" the sending of data, thus ensuring that the amount of
357 | unacknowledged data in flight remains constant, BBR creates an
358 | estimate of the bottleneck bandwidth and uses a local scheduling
359 | algorithm to send data at that rate. ACKs still play an important role
360 | in updating knowledge about the state of the network, but they are not
361 | directly used to pace transmissions. This means that delayed ACKs do
362 | not lead to sudden bursts of transmission. Of course, ``CongestionWindow`` is
363 | still used to ensure that enough data is sent to keep the pipe full,
364 | and to ensure that the amount of data in flight is not so much greater
365 | than the bandwidth-delay product as to cause queues to overflow.
366 | 
367 | In order to maintain an up-to-date view of the current RTT and
368 | bottleneck bandwidth, it is necessary to keep probing above and below
369 | the current estimate of the bottleneck bandwidth. More bandwidth can
370 | become available due to a reduction in the traffic from competing
371 | flows, changes in link properties (e.g., on wireless links), or routing
372 | changes. Changes in RTT are also possible, particularly if the path
373 | changes. To detect a change in RTT, it is necessary to send less
374 | traffic, hence draining queues. To detect a change in available
375 | bandwidth, it is necessary to send more traffic. Hence, BBR probes
376 | both above and below its current estimate of the bottleneck
377 | bandwidth. If necessary, the estimates are updated, and the sending
378 | rate and ``CongestionWindow`` are updated accordingly.
379 | 
380 | .. _fig-bbrstate:
381 | .. figure:: figures/Slide6.png
382 |    :width: 150px
383 |    :align: center
384 | 
385 |    State machine diagram for BBR.
386 | 
387 | The process of sequentially probing for the available bandwidth and
388 | the minimum RTT is captured in the state diagram of :numref:`Figure %s
389 | <fig-bbrstate>`. After an aggressive startup phase to try to establish
390 | the available bandwidth on the path, the sending rate is reduced to
391 | drain the queue, and then the algorithm settles into the inner loop of
392 | the diagram, in which it periodically checks for better delay at lower
393 | sending rates, or better throughput at higher sending rates. On a
394 | relatively long timescale (multiple seconds) the algorithm moves into
395 | the ``ProbeRTT`` state, lowering its sending rate by a factor of two in an
396 | effort to fully drain the queue and test for a lower RTT.
397 | 
398 | One interesting aspect of this approach is that when a large flow
399 | reduces its sending rate dramatically in the ``ProbeRTT`` state, that flow's contribution to queuing delay
400 | drops, which causes other flows to simultaneously see a new, lower RTT and update
401 | their estimates. Hence flows show a tendency to synchronize their RTT
402 | estimation at times when the queue is actually empty or close to it,
403 | improving the accuracy of this estimate.
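
To summarize the mechanics, the sketch below shows how a BBR-like sender
might turn its two estimates into a pacing schedule and a cap on in-flight
data. The structure and constants are illustrative assumptions on our part
(in BBR version 1, the pacing gain cycles above and below 1.0 to implement
the probing just described), not a transcription of any implementation.

.. code-block:: c

   #include <stdint.h>

   struct bbr_state {
       double btl_bw;       /* estimated bottleneck bandwidth, bytes/sec */
       double min_rtt;      /* estimated minimum RTT, seconds            */
       double pacing_gain;  /* >1 to probe for bandwidth, <1 to drain    */
   };

   /* Seconds to wait before transmitting the next 'len'-byte packet,
    * so data leaves the host at (pacing_gain * btl_bw) rather than in
    * ACK-triggered bursts. */
   double next_send_interval(const struct bbr_state *s, uint32_t len)
   {
       return len / (s->pacing_gain * s->btl_bw);
   }

   /* Cap on unacknowledged data: a small multiple of the estimated
    * bandwidth-delay product, keeping the pipe full without building
    * a standing queue. */
   uint32_t inflight_cap(const struct bbr_state *s)
   {
       double bdp = s->btl_bw * s->min_rtt;
       return (uint32_t)(2 * bdp);
   }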
404 | 
405 | 
406 | BBR is actively being worked on and rapidly evolving, with version 2
407 | in use at the time of writing. One major focus is fairness. For
408 | example, some early experiments showed CUBIC flows getting 100x less
409 | bandwidth when competing with BBR flows, and other experiments showed that
410 | unfairness among BBR flows is possible. BBR version 1 was insensitive
411 | to loss, which could lead to high loss rates, particularly when the
412 | amount of buffering on the path was relatively low. As several
413 | implementations of BBR are now being tried in different environments,
414 | including within Google's internal backbone and in the broader
415 | Internet, experience is being gathered to further refine the design. The
416 | IETF's Congestion Control Working Group is hosting discussions on the
417 | ongoing design and experimentation.
418 | 
419 | 
420 | .. _reading_bbr:
421 | .. admonition:: Further Reading
422 | 
423 |    N. Cardwell, Y. Cheng, C. S. Gunn, S. Yeganeh, and V. Jacobson. `BBR: Congestion-based
424 |    Congestion Control
425 |    `__.
426 |    Communications of the ACM, Volume 60, Issue 2, February 2017.
427 | 
428 | 
429 | 
--------------------------------------------------------------------------------
/biblio.rst:
--------------------------------------------------------------------------------
1 | Annotated Bibliography
2 | =======================
3 | 
4 | The set of research papers published on congestion control is
5 | extensive, with only a small subset cited in the main body of the
6 | book. This section is a place to collect a more comprehensive
7 | bibliography, which (for now) is organized according to the major themes
8 | covered in the book.
9 | 
10 | We invite the community to help keep the bibliography complete and
11 | up-to-date. Please submit a `Pull Request to GitHub
12 | `__ to include additional
13 | citations or to fix mistakes. Post an `Issue to GitHub
14 | `__ if you have
15 | suggestions for ways to improve how the bibliography is organized.
16 | 
17 | Foundational
18 | -----------------
19 | 
20 | Queuing Analysis
21 | ~~~~~~~~~~~~~~~~~~~~
22 | 
23 | * L. Kleinrock. `Queueing Systems, Volume 2
24 |   `__. Wiley &
25 |   Sons, May 1976.
26 | 
27 | * V. Paxson and S. Floyd. `Wide-Area Traffic: The Failure of Poisson
28 |   Modeling `__.
29 |   IEEE/ACM Transactions on Networking, June 1995.
30 | 
31 | * W. Leland, *et al*. `On the Self-Similar Nature of Ethernet Traffic
32 |   `__.
33 |   ACM SIGCOMM '93 Symposium, August 1993.
34 | 
35 | * J. Gettys. `Bufferbloat: Dark Buffers in the Internet
36 |   `__.
37 |   IEEE Internet Computing, April 2011.
38 | 
39 | Theoretical Underpinnings
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | * M. Mathis, J. Semke, J. Mahdavi, and T. Ott. `The Macroscopic
43 |   Behavior of the TCP Congestion Avoidance Algorithm
44 |   `__.
45 |   SIGCOMM CCR, 27(3), July 1997.
46 | 
47 | * F. Kelly. `Charging and Rate Control for Elastic Traffic
48 |   `__.
49 |   European Transactions on Telecommunications, 8:33–37, 1997.
50 | 
51 | * S. Athuraliya and S. Low. `An Empirical Validation of a Duality
52 |   Model of TCP and Active Queue Management Algorithms
53 |   `__.
54 |   Proceedings of the Winter Simulation Conference, 2001.
55 | 
56 | * R. Jain and K. K. Ramakrishnan. `Congestion Avoidance in Computer
57 |   Networks with a Connectionless Network Layer: Concepts, Goals and
58 |   Methodology `__. Computer
59 |   Networking Symposium, April 1988.
60 | 
61 | Evaluation Criteria
62 | ~~~~~~~~~~~~~~~~~~~~
63 | 
64 | * R. Jain, D. Chiu, and W. Hawe. `A Quantitative Measure of Fairness
65 |   and Discrimination for Resource Allocation in Shared Computer Systems
66 |   `__.
67 |   DEC Research Report TR-301, 1984.
68 | 
69 | * B. Briscoe.
`Flow Rate Fairness: Dismantling a Religion
70 |   `__.
71 |   ACM SIGCOMM CCR, April 2007.
72 | 
73 | * R. Ware, *et al*. `Beyond Jain's Fairness Index: Setting the Bar for
74 |   the Deployment of Congestion Control Algorithms
75 |   `__.
76 |   ACM SIGCOMM HotNets, November 2019.
77 | 
78 | Architecture
79 | ~~~~~~~~~~~~~
80 | 
81 | * J. Saltzer, D. Reed, and D. Clark. `End-to-End Arguments in System Design
82 |   `__.
83 |   ACM Transactions on Computer Systems, Nov. 1984.
84 | 
85 | * D. Clark. `The Design Philosophy of the DARPA Internet Protocols
86 |   `__.
87 |   ACM SIGCOMM, 1988.
88 | 
89 | * S. Jain, *et al*. `B4: Experience with a
90 |   Globally-Deployed Software Defined WAN
91 |   `__.
92 |   ACM SIGCOMM, August 2013.
93 | 
94 | * J. Perry, *et al*. `Fastpass: A Centralized "Zero-Queue" Datacenter Network
95 |   `__.
96 |   ACM SIGCOMM, August 2014.
97 | 
98 | 
99 | General-Purpose Algorithms
100 | --------------------------------
101 | 
102 | * V. Jacobson. `Congestion Avoidance and Control
103 |   `__. ACM SIGCOMM '88
104 |   Symposium, August 1988.
105 | 
106 | * J. Hoe. `Improving the Start-up Behavior of a Congestion Control
107 |   Scheme for TCP
108 |   `__. ACM SIGCOMM '96
109 |   Symposium. August 1996.
110 | 
111 | * L. Brakmo, S. O'Malley, and L. Peterson.
112 |   `TCP Vegas: New Technique for Congestion Detection and Avoidance
113 |   `__.
114 |   ACM SIGCOMM '94 Symposium. August 1994. (Reprinted in *IEEE/ACM Transactions
115 |   on Networking,* October 1995).
116 | 
117 | * S. Low, L. Peterson, and L. Wang. `Understanding TCP Vegas: A
118 |   Duality Model `__.
119 |   Journal of the ACM, Volume 49, Issue 2, March 2002.
120 | 
121 | * S. Ha, I. Rhee, and L. Xu. `CUBIC: a new TCP-friendly high-speed TCP variant
122 |   `__.
123 |   ACM SIGOPS Operating Systems Review, Volume 42, Issue 5, July 2008.
124 | 
125 | * N. Cardwell, Y. Cheng, C. S. Gunn, S. Yeganeh, and V. Jacobson.
126 |   `BBR: Congestion-based Congestion Control
127 |   `__.
128 |   Communications of the ACM, Volume 60, Issue 2, February 2017.
129 | 
130 | * B. Briscoe, *et al*. `Implementing the "Prague Requirements" for Low
131 |   Latency Low Loss Scalable Throughput (L4S)
132 |   `__.
133 |   Linux NetDev 0x13 Conference, March 2019.
134 | 
135 | Active Queue Management
136 | ---------------------------------
137 | 
138 | * K.K. Ramakrishnan and R. Jain. `A Binary Feedback Scheme for
139 |   Congestion Avoidance in Computer Networks with a Connectionless
140 |   Network Layer `__.
141 |   ACM SIGCOMM, August 1988.
142 | 
143 | * S. Floyd and V. Jacobson. `Random Early Detection (RED) Gateways for Congestion Avoidance
144 |   `__.
145 |   IEEE/ACM Transactions on Networking, August 1993.
146 | 
147 | * R. Braden, *et al*. `Recommendations on Queue Management and
148 |   Congestion Avoidance in the Internet
149 |   `__. RFC 2309, April 1998.
150 | 
151 | * K. Ramakrishnan, S. Floyd, and D. Black. `The Addition of Explicit
152 |   Congestion Notification (ECN) to IP
153 |   `__. RFC 3168,
154 |   September 2001.
155 | 
156 | * K. Nichols and V. Jacobson. `Controlling Queue Delay
157 |   `__.
158 |   ACM Queue, 10(5), May 2012.
159 | 
160 | Domain-Specific Algorithms
161 | -------------------------------
162 | 
163 | Datacenter
164 | ~~~~~~~~~~~~~~~~
165 | 
166 | * M. Alizadeh, *et al*. `Data Center TCP (DCTCP)
167 |   `__.
168 |   ACM SIGCOMM, August 2010.
169 | 
170 | * R. Mittal, *et al*. `TIMELY: RTT-based Congestion Control for the Datacenter
171 |   `__.
172 |   ACM SIGCOMM 2015.
173 | 
174 | * S. Liu, *et al*. `Breaking the Transience-Equilibrium Nexus: A New
175 |   Approach to Datacenter Packet Transport
176 |   `__.
177 |   Usenix NSDI '21. April 2021.
178 | 
179 | Background Transfers
180 | ~~~~~~~~~~~~~~~~~~~~~~~
181 | 
182 | * S. Shalunov, *et al*. `Low Extra Delay Background Transport (LEDBAT)
183 |   `__.
184 |   RFC 6817, December 2012.
185 | 
186 | HTTP
187 | ~~~~~~~~~~~~
188 | 
189 | * J. Iyengar and I. Swett, Eds.
190 |   `QUIC Loss Detection and Congestion Control
191 |   `__.
192 |   RFC 9002, May 2021.
193 | 
194 | Wireless
195 | ~~~~~~~~~~~~~~
196 | 
197 | * H. Jiang, Z. Liu, Y. Wang, K. Lee, and I. Rhee.
198 |   `Understanding Bufferbloat in Cellular Networks
199 |   `__.
200 |   ACM SIGCOMM Workshop on Cellular Networks, August 2012.
201 | 
202 | * K. Liu and J. Y. B. Lee. `On Improving TCP Performance over Mobile
203 |   Data Networks `__.
204 |   IEEE Transactions on Mobile Computing, 2016.
205 | 
206 | * Y. Xie, F. Yi, and K. Jamieson. `PBE-CC: Congestion Control via
207 |   Endpoint-Centric, Physical-Layer Bandwidth Measurements
208 |   `__. ACM SIGCOMM 2020.
209 | 
210 | * Y. Gao, *et al*. `Understanding On-device Bufferbloat For Cellular
211 |   Upload `__.
212 |   ACM Internet Measurement Conference (IMC), November 2016.
213 | 
214 | 
215 | Realtime
216 | ~~~~~~~~~~~~~~~
217 | 
218 | * S. Floyd, M. Handley, J. Padhye, and J. Widmer.
219 |   `TCP Friendly Rate Control (TFRC): Protocol Specification
220 |   `__.
221 |   RFC 5348, September 2008.
222 | 
223 | * J. Padhye, V. Firoiu, D. Towsley, and J. Kurose.
224 |   `Modeling TCP Throughput: A Simple Model and its Empirical Validation
225 |   `__.
226 |   ACM SIGCOMM, September 1998.
227 | 
228 | Multipath
229 | ~~~~~~~~~
230 | 
231 | * D. Wischik, C. Raiciu, A. Greenhalgh, and M. Handley.
232 |   `Design, Implementation and Evaluation of Congestion Control for Multipath TCP
233 |   `__.
234 |   NSDI, April 2011.
235 | 
236 | * C. Raiciu, M. Handley, and D. Wischik.
237 |   `Coupled Congestion Control for Multipath Transport Protocols
238 |   `__.
239 |   RFC 6356, October 2011.
240 | 
241 | 
242 | Implementations and Tools
243 | --------------------------------
244 | 
245 | * S.J. Leffler, M.K. McKusick, M.J. Karels, and J.S. Quarterman. `The
246 |   Design and Implementation of the 4.3 BSD Unix Operating System `__.
247 |   Addison-Wesley, January 1989.
248 | 
249 | * `Netesto `__.
250 | 
251 | * `NS-3 Network Simulator `__.
252 | 
253 | * `RFC 6298: Computing TCP's Retransmission Timer
254 |   `__. June 2011.
255 | 
256 | * The Linux Kernel. `BPF Documentation
257 |   `__.
258 | 
--------------------------------------------------------------------------------
/code/README:
--------------------------------------------------------------------------------
1 | Order in which code fragments are used (substituted in book.tex):
2 | 
3 | tcp_ip.rst: code/nagle.c
4 | 
5 | algorithm.rst: code/timeout.c
6 | algorithm.rst: code/cwin.c
7 | 
8 | aqm.rst: code/red.c
9 | 
10 | README: code/build.sh
11 | 
--------------------------------------------------------------------------------
/code/build.sh:
--------------------------------------------------------------------------------
1 | $ mkdir ~/tcpcc
2 | $ cd ~/tcpcc
3 | $ git clone https://github.com/SystemsApproach/tcpcc.git
--------------------------------------------------------------------------------
/code/cwin.c:
--------------------------------------------------------------------------------
1 | {
2 |     u_int cw = state->CongestionWindow;
3 |     u_int incr = state->maxseg;
4 | 
5 |     /* Below the threshold, grow by a full segment per ACK (slow
6 |      * start); above it, grow by maxseg*maxseg/cw per ACK, which
7 |      * adds up to about one segment per RTT (additive increase). */
8 |     if (cw > state->CongestionThreshold)
9 |         incr = incr * incr / cw;
10 |     state->CongestionWindow = MIN(cw + incr, TCP_MAXWIN);
11 | }
--------------------------------------------------------------------------------
/code/nagle.c:
--------------------------------------------------------------------------------
1 | When the application produces data to send
2 | if both the available data and the window >= MSS
3 |     send a full segment
4 | else
5 |     if there is unACKed data in flight
6 |         buffer the new data until an ACK arrives
7 |     else
8 |         send all the new data now
--------------------------------------------------------------------------------
/code/red.c:
--------------------------------------------------------------------------------
1 | if AvgLen <= MinThreshold
2 |     queue the packet
3 | if MinThreshold < AvgLen < MaxThreshold
4 |     calculate probability P
5 |     drop the arriving packet with probability P
6 | if MaxThreshold <= AvgLen
7 |     drop the arriving packet
--------------------------------------------------------------------------------
/code/timeout.c:
--------------------------------------------------------------------------------
1 | {
2 |     /* Jacobson/Karels RTT estimation in scaled integer arithmetic:
3 |      * EstimatedRTT and Deviation are stored scaled by 8, so the
4 |      * shifts implement an EWMA with gain 1/8. */
5 |     SampleRTT -= (EstimatedRTT >> 3);
6 |     EstimatedRTT += SampleRTT;     /* update the RTT estimate      */
7 |     if (SampleRTT < 0)
8 |         SampleRTT = -SampleRTT;    /* |error| in this sample       */
9 |     SampleRTT -= (Deviation >> 3);
10 |     Deviation += SampleRTT;        /* update the mean deviation    */
11 |     /* TimeOut = estimated RTT + 4 x mean deviation */
12 |     TimeOut = (EstimatedRTT >> 3) + (Deviation >> 1);
13 | }
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file only contains a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 | 
9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, as shown here.
14 | #
15 | # import os
16 | # import sys
17 | # sys.path.insert(0, os.path.abspath('.'))
18 | 
19 | import os
20 | 
21 | from subprocess import check_output, CalledProcessError
22 | 
23 | def get_version():
24 | 
25 |     try:
26 |         version = check_output(['cat', 'VERSION'],
27 |                                universal_newlines=True)
28 |     except CalledProcessError:
29 |         return 'unknown version'
30 | 
31 |     return version.rstrip()
32 | 
33 | # "version" is used for html build
34 | version = get_version()
35 | # "release" is used for LaTeX build
36 | release = version
37 | 
38 | 
39 | # -- Project information -----------------------------------------------------
40 | 
41 | project = u'TCP Congestion Control: A Systems Approach'
42 | copyright = u'2022, Systems Approach LLC (Publisher)'
43 | author = u'Peterson, Brakmo, Davie'
44 | 
45 | 
46 | # -- General configuration ---------------------------------------------------
47 | 
48 | # If your documentation needs a minimal Sphinx version, state it here.
49 | #
50 | # needs_sphinx = '1.0'
51 | 
52 | # treat all warnings as errors (currently disabled)
53 | warning_is_error = False
54 | 
55 | # Add any Sphinx extension module names here, as strings. They can be
56 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
57 | # ones. ***Replace "mathjax" with "imgmath" for epub output.***
58 | extensions = [
59 |     'sphinx.ext.autosectionlabel',
60 |     'sphinx.ext.coverage',
61 |     'sphinx.ext.ifconfig',
62 |     'sphinx.ext.mathjax',
63 |     'sphinx.ext.todo',
64 |     'sphinxcontrib.spelling',
65 |     "sphinx_multiversion",
66 | ]
67 | 
68 | # Text files with lists of words that shouldn't fail the spellchecker:
69 | spelling_word_list_filename = ['dict.txt', ]
70 | 
71 | # Add any paths that contain templates here, relative to this directory.
72 | templates_path = ['_templates']
73 | 
74 | # The suffix(es) of source filenames.
75 | # You can specify multiple suffix as a list of string:
76 | #
77 | # source_suffix = ['.rst', '.md']
78 | source_suffix = '.rst'
79 | 
80 | # The master toctree document.
81 | master_doc = 'index'
82 | 
83 | # The language for content autogenerated by Sphinx. Refer to documentation
84 | # for a list of supported languages.
85 | #
86 | # This is also used if you do content translation via gettext catalogs.
87 | # Usually you set "language" from the command line for these cases.
88 | language = 'en'
89 | 
90 | # List of patterns, relative to source directory, that match files and
91 | # directories to ignore when looking for source files.
92 | # This pattern also affects html_static_path and html_extra_path.
93 | exclude_patterns = [u'_build', 'venv-docs', 'requirements.txt', 'Thumbs.db', 'private', '.DS_Store', '*/README.rst']
94 | 
95 | # The name of the Pygments (syntax highlighting) style to use.
96 | pygments_style = None
97 | 
98 | # Enable numbered figures
99 | numfig = True
100 | numfig_format = {
101 |     'figure': 'Figure %s.',
102 |     'table': 'Table %s.'
103 | }
104 | 
105 | # Ignore link check for the following websites
106 | # linkcheck_ignore = [
107 | #     'https://SDN.systemsapproach.org/',
108 | # ]
109 | 
110 | # -- Options for HTML output -------------------------------------------------
111 | 
112 | # The theme to use for HTML and HTML Help pages. See the documentation for
113 | # a list of builtin themes.
114 | #
115 | html_theme = 'sphinx_rtd_theme'
116 | 
117 | # Theme options are theme-specific and customize the look and feel of a theme
118 | # further. For a list of options available for each theme, see the
119 | # documentation.
120 | #
121 | html_theme_options = {
122 |     'prev_next_buttons_location': 'both'
123 | }
124 | 
125 | # Add any paths that contain custom static files (such as style sheets) here,
126 | # relative to this directory. They are copied after the builtin static files,
127 | # so a file named "default.css" will overwrite the builtin "default.css".
128 | html_static_path = ['_static']
129 | 
130 | html_css_files = [
131 |     'css/rtd_theme_mods.css',
132 | ]
133 | 
134 | 
135 | # HTML Favicon
136 | html_favicon = '_static/bridge.ico'
137 | 
138 | # HTML Index
139 | html_use_index = False
140 | 
141 | # Custom sidebar templates, must be a dictionary that maps document names
142 | # to template names.
143 | #
144 | # The default sidebars (for documents that don't match any pattern) are
145 | # defined by theme itself. Builtin themes are using these templates by
146 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
147 | # 'searchbox.html']``.
148 | #
149 | # html_sidebars = {}
150 | 
151 | # extra HTML files
152 | html_extra_path = ['_extra']
153 | 
154 | # -- Options for HTMLHelp output ---------------------------------------------
155 | 
156 | # Output file base name for HTML help builder.
157 | htmlhelp_basename = 'SystemsApproach'
158 | 
159 | 
160 | # -- Options for LaTeX output ------------------------------------------------
161 | #latex_engine = 'xelatex'
162 | 
163 | latex_elements = {
164 |     # The paper size ('letterpaper' or 'a4paper').
165 |     #
166 |     'papersize': 'letterpaper',
167 | 
168 |     # The font size ('10pt', '11pt' or '12pt').
169 |     #
170 |     'pointsize': '11pt',
171 | 
172 |     # Get unicode to work
173 |     #
174 |     'fontenc': '\\usepackage[LGR,T1]{fontenc}',
175 | 
176 |     # Latex figure (float) alignment
177 |     #
178 |     'figure_align': 'ht',
179 | }
180 | 
181 | # Grouping the document tree into LaTeX files. List of tuples
182 | # (source start file, target name, title,
183 | #  author, documentclass [howto, manual, or own class]).
184 | latex_documents = [
185 |     (master_doc, 'book.tex', u'TCP Congestion Control: A Systems Approach',
186 |      u'Peterson, Brakmo and Davie', 'manual', True),
187 | ]
188 | 
189 | latex_toplevel_sectioning = 'chapter'
190 | 
191 | 
192 | # -- Options for manual page output ------------------------------------------
193 | 
194 | # One entry per manual page. List of tuples
195 | # (source start file, name, description, authors, manual section).
196 | man_pages = [
197 |     (master_doc, 'Systems Approach', u'Systems Approach',
198 |      [author], 1)
199 | ]
200 | 
201 | 
202 | # -- Options for Texinfo output ----------------------------------------------
203 | 
204 | # Grouping the document tree into Texinfo files. List of tuples
205 | # (source start file, target name, title, author,
206 | #  dir menu entry, description, category)
207 | texinfo_documents = [
208 |     (master_doc, 'TCP Congestion Control', u'TCP Congestion Control: A Systems Approach',
209 |      author, 'Peterson, Brakmo, and Davie', 'A Systems Approach',
210 |      'Miscellaneous'),
211 | ]
212 | 
213 | 
214 | # -- Options for Epub output -------------------------------------------------
215 | epub_title = project
216 | epub_description = 'Efficient Sharing of Network Resources'
217 | epub_cover = ('_static/cover.jpg', '')
218 | epub_show_urls = 'False'
219 | epub_use_index = False
220 | 
221 | imgmath_font_size = 10
222 | 
223 | # The unique identifier of the text. This can be an ISBN number
224 | # or the project homepage.
225 | #
226 | # epub_identifier = ''
227 | 
228 | # A unique identification for the text.
229 | #
230 | # epub_uid = ''
231 | 
232 | # A list of files that should not be packed into the epub file.
233 | epub_exclude_files = ['search.html', 'robots.txt']
234 | 
235 | 
236 | # -- Extension configuration -------------------------------------------------
237 | 
238 | # -- options for Intersphinx extension ---------------------------------------
239 | 
240 | intersphinx_mapping = {
241 |     'sphinx': ('https://www.sphinx-doc.org/en/master', None),
242 |     'aether': ('https://docs.aetherproject.org/master', None),
243 |     'sdcore': ('https://docs.sd-core.opennetworking.org/master', None),
244 |     'sdran': ('https://docs.sd-ran.org/master', None),
245 |     'sdfabric': ('https://docs.sd-fabric.org/master', None),
246 |     'sysapproach5g': ('https://5g.systemsapproach.org/', None),
247 |     'sysapproachnet': ('https://book.systemsapproach.org/', None),
248 |     'sysapproachsdn': ('https://sdn.systemsapproach.org/', None),
249 | }
250 | 
251 | # -- Options for todo extension ----------------------------------------------
252 | # If true, `todo` and `todoList` produce output, else they produce nothing.
253 | todo_include_todos = True
254 | 
255 | 
256 | # -- Set up Google Analytics
257 | # -- using approach at https://stackoverflow.com/questions/9444342/adding-a-javascript-script-tag-some-place-so-that-it-works-for-every-file-in-sph/41885884#41885884
258 | 
259 | 
260 | GA_INVOKE_JS = """
261 | window.dataLayer = window.dataLayer || [];
262 | function gtag(){dataLayer.push(arguments);}
263 | gtag('js', new Date());
264 | 
265 | gtag('config', 'G-SQ9EK50CDR');
266 | """
267 | 
268 | def setup(app):
269 | 
270 |     app.add_js_file('https://www.googletagmanager.com/gtag/js?id=G-SQ9EK50CDR', loading_method="async")
271 |     app.add_js_file(None, body=GA_INVOKE_JS)
--------------------------------------------------------------------------------
/design.rst:
--------------------------------------------------------------------------------
1 | Chapter 3: Design Space
2 | ==========================
3 | 
4 | With the architectural foundation of TCP/IP in place, we are ready to
5 | explore the design space for addressing congestion. But to do this,
6 | it is helpful to first take a step back and consider the bigger
7 | picture. The Internet is a complex arrangement of compute, storage,
8 | and communication resources that is shared among millions of
9 | users. The challenge is how to assign those resources—specifically
10 | switching capacity, buffer space, and link bandwidth—to end-to-end
11 | packet flows.
12 | 
13 | Because the Internet originally adopted a best-effort service model,
14 | and users (or more precisely, TCP running on their behalf) were free
15 | to send as many packets into the network as they could generate, it
16 | was not surprising that the Internet eventually suffered from the
17 | *tragedy of the commons*. And with users starting to experience congestion
18 | collapse, the natural response was to try to control it. Hence the
19 | term *congestion control*, which can be viewed as an implicit
20 | mechanism for allocating resources. It is implicit in the sense that
21 | as the control mechanism detects resources
22 | becoming scarce, it reacts in an effort to alleviate congestion.
23 | 
24 | A network service model in which resources are *explicitly* allocated
25 | to packet flows is the obvious alternative; for example, an
26 | application could make an explicit request for resources before
27 | sending traffic.
The best-effort assumption of IP meant such an 28 | approach was not immediately viable at the time congestion became a 29 | serious issue. Subsequent work was done to retrofit more explicit 30 | resource allocation mechanisms to the Internet's best-effort delivery 31 | model, including the ability to make *Quality-of-Service (QoS)* 32 | guarantees. It is instructive to consider the Internet's approach to 33 | congestion in the context of such efforts. The first section does so 34 | as it explores the set of design decisions that underlie the control 35 | mechanisms outlined in this book. We then define the criteria by 36 | which different congestion-control mechanisms can be quantitatively 37 | evaluated and compared. 38 | 39 | 3.1 Implementation Choices 40 | ------------------------------- 41 | 42 | We start by introducing four implementation choices that a congestion 43 | control mechanism faces, and the design rationale behind the decisions 44 | that were made for TCP/IP. Some of the decisions were "obvious" given 45 | the circumstances under which they were made, but for completeness—and 46 | because the Internet's continual evolution means circumstances 47 | change—it is prudent to consider them all. 48 | 49 | 3.1.1 Centralized versus Distributed 50 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | 52 | In principle, the first design decision is whether a network's 53 | approach to resource allocation is centralized or distributed. In 54 | practice, the Internet's scale—along with the autonomy of the 55 | organizations that connect to it—dictated a distributed 56 | approach. Indeed, distributed management of resources was an 57 | explicitly stated goal of the Internet's design, as articulated by 58 | Dave Clark. But acknowledging this default decision is important for 59 | two reasons. 60 | 61 | .. _reading_design: 62 | .. admonition:: Further Reading 63 | 64 | D. Clark, `The Design Philosophy of the DARPA Internet 65 | Protocols `__. 66 | ACM SIGCOMM, 1988. 67 | 68 | First, while the Internet's approach to congestion control is 69 | distributed across its millions of hosts and routers, it is fair to 70 | think of them as cooperatively trying to achieve a globally optimal 71 | solution. From this perspective, there is a shared objective 72 | function, and all the elements are implementing a distributed 73 | algorithm to optimize that function. The various mechanisms described 74 | throughout this book are simply defining different objective 75 | functions, where a persistent challenge has been how to think about 76 | competing objective functions when multiple mechanisms have been 77 | deployed. 78 | 79 | Second, while a centralized approach is not practical for the Internet 80 | as a whole, it can be appropriate for limited domains. For example, a 81 | logically centralized controller could collect information about the 82 | state of the network's links and switches, compute a globally optimal 83 | allocation, and then advise (or even police) end hosts as to how much 84 | capacity is available to each of them. Such an approach would certainly 85 | be limited by the time-scale in which the centralized controller could 86 | be responsive to changes in the network, but it has been successfully 87 | applied to the coarse-grained allocation decisions made by traffic 88 | engineering mechanisms like B4 and SWAN. 
Exactly where one draws a
89 | line between coarse-grain traffic engineering decisions and fine-grain
90 | congestion control decisions is not clear, but it's good to keep an
91 | open mind about the spectrum of options that are available.
92 |
93 | .. _reading_b4:
94 | .. admonition:: Further Reading
95 |
96 |    S. Jain, *et al*. `B4: Experience with a
97 |    Globally-Deployed Software Defined WAN
98 |    `__.
99 |    ACM SIGCOMM, August 2013.
100 |
101 | Centralized control has also been used effectively in datacenters,
102 | which are an interesting environment for congestion control. First,
103 | they have very low RTTs (for traffic between servers in the
104 | datacenter, if not for flows heading in or out of the datacenter).
105 | Second, in many cases a datacenter can be treated as a greenfield,
106 | raising the possibility of trying new approaches that don't have to
107 | coexist fairly with incumbent algorithms. Fastpass, developed in a
108 | collaboration between MIT and Facebook researchers, is a good example
109 | of such a centralized approach.
110 |
111 | .. _reading_fastpass:
112 | .. admonition:: Further Reading
113 |
114 |    J. Perry, *et al*. `Fastpass: A Centralized "Zero-Queue" Datacenter Network
115 |    `__.
116 |    ACM SIGCOMM, August 2014.
117 |
118 |
119 | 3.1.2 Router-Centric versus Host-Centric
120 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
121 |
122 | Given a distributed approach to resource allocation, the next question
123 | is whether to implement the mechanism inside the network (i.e., at
124 | the routers or switches) or at the edges of the network (i.e., in the
125 | hosts, perhaps as part of the transport protocol). This is not
126 | strictly an either/or situation. Both locations are involved, and the
127 | real issue is where the majority of the burden falls. Individual
128 | routers always take responsibility for deciding which packets to
129 | forward and which packets to drop. However, there is a range of options
130 | for how much the router involves the end hosts, whether in specifying
131 | how this decision is made or in learning how it was made.
132 |
133 | At one end of the spectrum, routers can allow hosts to reserve
134 | capacity and then ensure each flow's packets are delivered
135 | accordingly. They might do this, for example, by implementing a
136 | signalling protocol along with Fair
137 | Queuing, accepting new flows only when there is sufficient capacity,
138 | and policing hosts to make sure their flows stay within their
139 | reservations. This would correspond to a reservation-based approach in
140 | which the network is able to make QoS guarantees. We consider this
141 | out of scope for the purposes of this book.
142 |
143 | At the other end of the spectrum is a host-centric approach. The
144 | router makes no guarantees and offers no explicit feedback about the
145 | available capacity (i.e., silently drops packets when its buffers are
146 | full) and it is the host's responsibility to observe the network
147 | conditions (e.g., how many packets it is successfully getting
148 | through the network) and adjust its behavior accordingly.
149 |
150 | In the middle, routers can take more proactive action to assist the
151 | end hosts in doing their job, but not by reserving buffer space. This
152 | involves the router sending *feedback* to the end hosts when its
153 | buffers are full.
We describe some of these forms of *Active Queue
154 | Management (AQM)* in Chapter 6, but the host-centric mechanisms
155 | described in the next two chapters assume routers silently tail-drop
156 | packets when their buffers are full.
157 |
158 | Historically, the host-centric approach has been implemented in the
159 | transport layer—usually by TCP, or by some other transport protocol
160 | that mimics TCP's algorithm, such as DCCP (Datagram Congestion Control
161 | Protocol) or QUIC (a relatively recent transport protocol designed for
162 | HTTP-based applications). However, it is also possible to implement
163 | congestion control in the application itself. *DASH (Dynamic Adaptive
164 | Streaming over HTTP)* is an example, although it is best viewed as a
165 | combination of congestion control in the transport layer (since it
166 | runs over TCP) and the application layer. Based on measured network
167 | performance, the server that is streaming video to a client switches
168 | among a range of different video encodings, thus changing the rate at
169 | which data is sent into the HTTP stream. In effect, TCP tries to find
170 | a sustainable bandwidth for the flow, and then the application adapts
171 | its sending rate to fully leverage that rate without sending more data
172 | than can be sustained under the current network conditions. Primary
173 | responsibility for congestion control falls to TCP, but the
174 | application aims to keep the pipe full while also maintaining a good
175 | user experience.
176 |
177 |
178 | 3.1.3 Window-Based versus Rate-Based
179 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
180 |
181 | Having settled on a host-centric approach, the next implementation
182 | choice is whether the mechanism is *window-based* or *rate-based*.
183 | TCP uses a window-based mechanism to implement flow control, so the
184 | design decision for TCP congestion control seems obvious. And in
185 | fact, the congestion-control mechanisms described in Chapter 4 are
186 | centered around an algorithm for computing a *congestion window*,
187 | where the sender is throttled by whichever is smaller: the advertised
188 | flow-control window or the computed congestion-control window.
189 |
190 | But it is also possible to compute the rate at which the network is able
191 | to deliver packets, and to pace transmissions accordingly. The
192 | observed rate is just the number of bytes delivered over some time
193 | period, such as the measured RTT. We point out this duality between
194 | rates and windows because a rate-based approach is more appropriate
195 | for multimedia applications that generate data at some average rate
196 | and which need at least some minimum throughput to be useful. For
197 | example, a video codec might generate video at an average rate of
198 | 1 Mbps with a peak rate of 2 Mbps.
199 |
200 | A rate-based approach is the logical choice in a reservation-based
201 | system that supports different QoS levels, but even in a best-effort
202 | network like the Internet, it is possible to implement an adaptive
203 | rate-based congestion-control mechanism that informs the application
204 | when it needs to adjust its transmission rate, for example by adjusting
205 | its codec. This is the core idea of TCP-friendly rate control (TFRC),
206 | which extends the concepts of TCP congestion avoidance to applications
207 | that more naturally send packets at a specific rate (e.g., the bitrate
208 | produced by a video codec at a given quality level). TFRC is typically
209 | used in conjunction with RTP, a transport protocol designed for real-time
210 | applications. We will see examples of such mechanisms in Chapter 7.
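The duality between windows and rates can be seen in a few lines of
code. The following sketch is our own illustration, with made-up
numbers: it computes the window that throttles a TCP sender as the
lesser of the advertised flow-control window and the computed
congestion window, and then converts that window into the equivalent
sending rate by dividing by the RTT.

.. code-block:: c

   #include <stdio.h>

   /* Sender-side state; the values below are hypothetical. */
   struct tcp_state {
       double cwnd;   /* congestion window (bytes)              */
       double rwnd;   /* advertised flow-control window (bytes) */
       double rtt;    /* measured round-trip time (seconds)     */
   };

   /* Window-based view: the sender may have no more than
    * min(cwnd, rwnd) bytes in flight at any time. */
   double effective_window(const struct tcp_state *s)
   {
       return s->cwnd < s->rwnd ? s->cwnd : s->rwnd;
   }

   /* Rate-based view of the same limit: one window's worth of
    * bytes delivered per RTT corresponds to a sending rate. */
   double equivalent_rate(const struct tcp_state *s)
   {
       return effective_window(s) / s->rtt;   /* bytes per second */
   }

   int main(void)
   {
       struct tcp_state s = { 120000.0, 65535.0, 0.010 };
       printf("window %.0f bytes -> rate %.1f Mbps\n",
              effective_window(&s),
              equivalent_rate(&s) * 8 / 1e6);
       return 0;
   }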
211 |
212 | Finally, one of the recent advances in TCP congestion control is BBR
213 | (Bottleneck Bandwidth and RTT), which uses a combination of
214 | window-based and rate-based control, in an effort to limit the
215 | build-up of queues within the network. We examine this approach in some
216 | detail in Chapter 5.
217 |
218 |
219 |
220 |
221 | 3.1.4 Control-based versus Avoidance-based
222 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
223 |
224 | The final implementation choice we draw attention to is somewhat
225 | subtle. The challenge is for the end-host, based on feedback and
226 | observations, to compute how much capacity is available in the
227 | network, and adjust its sending rate accordingly. There are two
228 | general strategies for doing this: an aggressive approach that
229 | purposely sends packets at a rate that causes packet loss and then
230 | responds to it, and a conservative approach that tries to detect the
231 | onset of queue build-up and slow down before queues actually overflow.
232 | We refer to the mechanisms of the first type as *control-based*, and
233 | we refer to mechanisms of the second type as *avoidance-based*.
234 |
235 | .. _reading_avoidance:
236 | .. admonition:: Further Reading
237 |
238 |    R. Jain and K. K. Ramakrishnan. `Congestion Avoidance in
239 |    Computer Networks with a Connectionless Network Layer:
240 |    Concepts, Goals and Methodology `__.
241 |    Computer Networking Symposium, April 1988.
242 |
243 | This distinction was first called out by Raj Jain and
244 | K.K. Ramakrishnan in 1988. It is often overlooked—and the term
245 | "congestion control" is used generically to refer to both—but our take
246 | is that the distinction is important, and so we
247 | will call it out when appropriate. Admittedly, we will also fall back
248 | to the generic use of "congestion control" when the distinction is not
249 | critical to the discussion.
250 |
251 | Also note that the approaches we call "control-based" and
252 | "avoidance-based" are sometimes referred to as *loss-based* and
253 | *delay-based*, respectively, according to the criteria each uses as a
254 | signal that the congestion window needs to be adjusted. The former
255 | adjusts the window when it detects a loss and the latter adjusts the
256 | window when it detects a change in the delay gradient. When viewed
257 | from this perspective, each of the algorithms introduced over the next
258 | four chapters effectively refines the fidelity of these signals in one
259 | way or another.
260 |
261 |
262 | 3.2 Evaluation Criteria
263 | -----------------------
264 |
265 | Having identified the set of design decisions that go into crafting a
266 | congestion-control mechanism, the next question is whether any given
267 | solution is good or not. Recall that in Chapter 1 we posed the
268 | question of how a network *effectively* and *fairly* allocates its
269 | resources. This suggests at least two broad measures by which a
270 | resource allocation scheme can be evaluated. We consider each in turn.
271 |
272 | 3.2.1 Effectiveness
273 | ~~~~~~~~~~~~~~~~~~~
274 |
275 | A good starting point for evaluating the effectiveness of a
276 | congestion-control mechanism is to consider the two principal metrics
277 | of networking: throughput and delay.
Clearly, we want as much
278 | throughput and as little delay as possible. Unfortunately, these goals
279 | can be at odds with each other. One way to increase throughput is to
280 | allow as many packets into the network as possible, so as to drive the
281 | utilization of all the links up to 100%. We would do this to avoid the
282 | possibility of a link becoming idle because an idle link hurts
283 | throughput. The problem with this strategy is that increasing the
284 | number of packets in the network also increases the length of the
285 | queues at each router. Such *persistent queues* mean packets are
286 | delayed in the network, or worse, dropped. Having to drop packets in
287 | the middle of the network not only impacts delay but also hurts
288 | throughput because upstream link bandwidth has been wasted on a packet
289 | that was not successfully delivered all the way to the destination.\ [#]_
290 |
291 | .. [#]
292 |    We sometimes use the term *goodput* instead of *throughput* to
293 |    emphasize that we care about data that is successfully delivered
294 |    through the network to the receiver, as opposed to just transmitted
295 |    by the sender.
296 |
297 | The ratio of throughput to delay is a general metric for evaluating
298 | the effectiveness of a resource allocation scheme. This ratio is
299 | sometimes referred to as the *power* of the system:
300 |
301 | .. math::
302 |
303 |    \mathsf{Power = Throughput / Delay}
304 |
305 | Intuitively, the objective is to maximize this ratio, which is a
306 | function of how much load you place on the system. The load, in turn,
307 | is set by the resource allocation mechanism. :numref:`Figure %s
308 | <fig-power>` gives a representative power curve, where, ideally, the
309 | resource allocation mechanism would operate at the peak of this
310 | curve. To the left of the peak, the mechanism is being too
311 | conservative; that is, it is not allowing enough packets to be sent to
312 | keep the links busy. To the right of the peak, so many packets are
313 | being allowed into the network that either (a) increases in delay
314 | (denominator) due to queuing are starting to dominate any small gains
315 | in throughput, or (b) throughput (numerator) actually starts to drop
316 | due to packets being dropped.
317 |
318 | .. _fig-power:
319 | .. figure:: figures/f06-03-9780123850591.png
320 |    :width: 350px
321 |    :align: center
322 |
323 |    Ratio of throughput to delay as a function of load.
324 |
325 | Moreover, we need to be concerned about what happens even when the
326 | system is operating under heavy load—towards the right end of the
327 | curve in :numref:`Figure %s <fig-power>`. Ideally, we would like to
328 | avoid the situation in which the system throughput approaches
329 | zero. The goal is for the mechanism to be *stable*\ —where packets
330 | continue to get through the network even when it is operating under
331 | heavy load. If a mechanism is not stable under heavy load, the
332 | network will suffer from *congestion collapse*.
333 |
334 | Note that while both "persistent queues" and "congestion collapse" are
335 | to be avoided, there is no precise definition for the threshold at
336 | which a network suffers from either. They are both subjective
337 | judgments about an algorithm's behavior, where at the end of the day,
338 | latency and throughput are the two performance indicators that matter.
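To make the power metric concrete, the following sketch computes it
over a series of load levels and reports where it peaks. The
measurements here are invented for illustration (real values would
come from experiments like those described in Section 3.4), with
delay growing sharply and throughput eventually collapsing as load
increases, mirroring the shape of the curve above.

.. code-block:: c

   #include <stdio.h>

   /* Hypothetical measurements taken at increasing offered loads:
    * throughput in Gbps, average delay in ms. Not real data. */
   static double throughput[] = {1.0, 3.0, 6.0, 8.0, 9.0, 9.2, 8.0, 4.0};
   static double delay[]      = {1.0, 1.1, 1.3, 2.0, 4.0, 9.0, 25.0, 80.0};
   #define NPOINTS ((int)(sizeof(throughput) / sizeof(throughput[0])))

   int main(void)
   {
       int peak = 0;
       for (int i = 0; i < NPOINTS; i++) {
           double power = throughput[i] / delay[i];
           printf("load level %d: power = %.3f\n", i, power);
           if (power > throughput[peak] / delay[peak])
               peak = i;
       }
       printf("power peaks at load level %d\n", peak);
       return 0;
   }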
339 |
340 | 3.2.2 Fairness
341 | ~~~~~~~~~~~~~~~~~~~
342 |
343 | The effective utilization of network resources is not the only criterion
344 | for judging a resource allocation scheme. We must also consider the
345 | issue of fairness. However, we quickly get into murky waters when we try
346 | to define what exactly constitutes fair resource allocation. For
347 | example, a reservation-based resource allocation scheme provides an
348 | explicit way to create controlled unfairness. With such a scheme, we
349 | might use reservations to enable a video stream to receive 1 Mbps across
350 | some link while a file transfer receives only 10 kbps over the same
351 | link.
352 |
353 | In the absence of explicit information to the contrary, when several
354 | flows share a particular link, we would like for each flow to receive
355 | an equal share of the bandwidth. This definition presumes that a
356 | *fair* share of bandwidth means an *equal* share of bandwidth. But,
357 | even in the absence of reservations, equal shares may not equate to
358 | fair shares. Should we also consider the length of the paths being
359 | compared? For example, as illustrated in :numref:`Figure %s
360 | <fig-path-len>`, what is fair when one four-hop flow is competing with
361 | three one-hop flows?
362 |
363 | .. _fig-path-len:
364 | .. figure:: figures/Slide10.png
365 |    :width: 550px
366 |    :align: center
367 |
368 |    One four-hop flow competing with three one-hop flows.
369 |
370 | Assuming that the most fair situation would be one in which all flows
371 | receive the same bandwidth,
372 | networking researcher Raj Jain proposed a metric that can be used to
373 | quantify the fairness of a congestion-control mechanism. Jain’s fairness
374 | index is defined as follows. Given a set of flow throughputs
375 |
376 | .. math::
377 |
378 |    (x_{1}, x_{2}, \ldots , x_{n})
379 |
380 | (measured in consistent units such as bits/second), the following
381 | function assigns a fairness index to the flows:
382 |
383 | .. math::
384 |
385 |    f(x_{1}, x_{2}, \ldots ,x_{n}) = \frac{( \sum_{i=1}^{n} x_{i}
386 |    )^{2}} {n \sum_{i=1}^{n} x_{i}^{2}}
387 |
388 | The fairness index always results in a number between 0 and 1, with 1
389 | representing the greatest fairness. To understand the intuition behind this
390 | metric, consider the case where all *n* flows receive a throughput of
391 | 1 unit of data per second. We can see that the fairness index in this
392 | case is
393 |
394 | .. math::
395 |
396 |    \frac{n^2}{n \times n} = 1
397 |
398 | Now suppose one flow receives a throughput of :math:`1 + \Delta`
399 | while the others continue to receive 1. The fairness index is now
400 |
401 | .. math::
402 |
403 |    \frac{((n - 1) + 1 + \Delta)^2}{n(n - 1 + (1 + \Delta)^2)}
404 |    = \frac{n^2 + 2n\Delta + \Delta^2}{n^2 + 2n\Delta + n\Delta^2}
405 |
406 | Note that the denominator exceeds the numerator by :math:`(n-1)\Delta^2`.
407 | Thus, whether the odd flow out was getting more or less than all the
408 | other flows (positive or negative :math:`\Delta`), the fairness index has
409 | now dropped below one. Another simple case to
410 | consider is where only *k* of the *n* flows receive equal throughput,
411 | and the remaining *n-k* flows receive zero throughput, in which case the
412 | fairness index drops to \ *k/n*.
413 |
414 | .. _reading_jain:
415 | .. admonition:: Further Reading
416 |
417 |    R. Jain, D. Chiu, and W. Hawe. `A Quantitative Measure of Fairness
418 |    and Discrimination for Resource Allocation in Shared Computer Systems
419 |    `__.
420 |    DEC Research Report TR-301, 1984.
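The index is straightforward to compute. The following C sketch
implements the formula directly and checks the three cases just
discussed: equal shares, one flow receiving :math:`1 + \Delta`, and
only *k* of *n* flows receiving any throughput. The throughput values
are made up for the purposes of illustration.

.. code-block:: c

   #include <stdio.h>

   /* Jain's fairness index: (sum x_i)^2 / (n * sum x_i^2).
    * Returns a value in (0, 1], where 1 means all flows receive
    * an equal share. */
   double jain_index(const double x[], int n)
   {
       double sum = 0.0, sum_sq = 0.0;
       for (int i = 0; i < n; i++) {
           sum += x[i];
           sum_sq += x[i] * x[i];
       }
       return (sum * sum) / (n * sum_sq);
   }

   int main(void)
   {
       double equal[]   = {1.0, 1.0, 1.0, 1.0};  /* index = 1          */
       double skewed[]  = {1.0, 1.0, 1.0, 1.5};  /* one flow gets more */
       double starved[] = {1.0, 1.0, 0.0, 0.0};  /* k/n = 2/4 = 0.5    */

       printf("equal: %.3f  skewed: %.3f  starved: %.3f\n",
              jain_index(equal, 4), jain_index(skewed, 4),
              jain_index(starved, 4));
       return 0;
   }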
421 |
422 | In the next section we revisit the notion of fairness as it applies to
423 | the deployment of new congestion control algorithms. As noted above,
424 | it is not as clear-cut as it might first appear.
425 |
426 | TCP-friendly rate control (TFRC) also uses the notion of
427 | fairness. TFRC uses the TCP throughput equation (discussed in Section
428 | 1.3) to estimate the share of a
429 | congested link's bandwidth that
430 | would be obtained by a flow that implemented TCP's congestion control
431 | scheme, and sets that as a target rate for an application to
432 | send data. The application can then make decisions to help it hit that
433 | target rate. For example, a video streaming application might choose among a
434 | set of different encoding quality levels to try to maintain an
435 | average rate at the "fair" level as determined by TFRC.
436 |
437 | 3.3 Comparative Analysis
438 | ---------------------------
439 |
440 | The first step in evaluating any congestion control mechanism is to
441 | measure its performance in isolation, including:
442 |
443 | * The average throughput (goodput) flows are able to achieve.
444 |
445 | * The average end-to-end delay flows experience.
446 |
447 | * Whether the mechanism avoids persistent queues across a range of
448 |   operating scenarios.
449 |
450 | * Whether the mechanism remains stable across a range of operating scenarios.
451 |
452 | * The degree to which flows receive a fair share of the available
453 |   capacity.
454 |
455 | The inevitable second step is to compare two or more mechanisms. This
456 | is because, given the decentralized nature of the Internet, there is
457 | no way to ensure uniform adoption of just one mechanism.
458 | Comparing quantitative metrics like throughput is easy. The problem is
459 | how to evaluate multiple mechanisms that might coexist, competing with
460 | each other for network resources.
461 |
462 | The question is not whether a given mechanism treats all of its flows
463 | fairly, but whether mechanism A is fair to flows managed by
464 | mechanism B. If mechanism A achieves better throughput
465 | than B, but does so by being more aggressive, and hence stealing
466 | bandwidth from B's flows, then A's improvement is not fairly gained
467 | and may be discounted. It should be evident that the Internet's highly
468 | decentralized approach to congestion control
469 | works because a large number of flows respond in a cooperative way to
470 | congestion. This opens the door to more aggressive flows improving
471 | their performance at the expense of those that implement the
472 | accepted, less aggressive algorithms.
473 |
474 | .. _reading_ware:
475 | .. admonition:: Further Reading
476 |
477 |    R. Ware, *et al*. `Beyond Jain's Fairness Index: Setting the Bar for
478 |    the Deployment of Congestion Control Algorithms
479 |    `__.
480 |    ACM SIGCOMM HotNets. November 2019.
481 |
482 | Arguments like this have been made many times over the last 30 years,
483 | which has raised a high bar to the deployment of new algorithms. Even
484 | if global deployment of a new algorithm would be a net positive,
485 | incremental deployment (which is the only real option) could
486 | negatively impact flows using existing algorithms, leading to a
487 | reluctance to deploy new approaches.
But such
488 | analysis suffers from three problems, as identified by Ranysha Ware and
489 | colleagues:
490 |
491 | * **Ideal-Driven Goalposting:** A fairness-based threshold asserts that
492 |   new mechanism B should equally share the bottleneck link with
493 |   currently deployed mechanism A. This goal is too idealistic in
494 |   practice, especially when A is sometimes unfair to its own flows.
495 |
496 | * **Throughput-Centricity:** A fairness-based threshold focuses on
497 |   how new mechanism B impacts a competitor flow using mechanism A
498 |   by focusing on A’s achieved throughput. However, this ignores other
499 |   important figures of merit for good performance, such as latency,
500 |   flow completion time, or loss rate.
501 |
502 | * **Assumption of Balance:** Inter-mechanism interactions often have
503 |   some bias, but a fairness metric cannot tell whether the outcome
504 |   is biased for or against the status quo. It makes a difference in
505 |   terms of deployability whether a new mechanism B takes a larger
506 |   share of bandwidth than legacy mechanism A or leaves a larger
507 |   share for A to consume: the former might elicit complaints from
508 |   legacy users of A, where the latter would not. Jain’s Fairness
509 |   Index assigns an equivalent score to both scenarios.
510 |
511 | Instead of a simple calculation of Jain's fairness index, Ware
512 | advocates for a threshold based on *harm*, as measured by a reduction
513 | in throughput or an increase in latency or jitter. Intuitively, if the
514 | harm that flows using a new mechanism B inflict on flows using existing
515 | mechanism A is within a bound derived from how much harm A-managed
516 | flows cause other A-managed flows, we can consider B deployable
517 | alongside A without harm. Ware goes on to propose concrete measures of
518 | acceptable harm, a task that turns out to be more complicated than it
519 | might first appear. Even with a single congestion control algorithm,
520 | the amount of harm that one flow causes another depends on factors
521 | such as its RTT, start time, and duration. Thus, measures of harm need
522 | to take into account the range of impacts that different flows have on
523 | each other under the existing regime and aim to do no worse with a
524 | new algorithm.
525 |
526 | 3.4 Experimental Methodology
527 | --------------------------------
528 |
529 | Our approach to evaluating congestion-control mechanisms is to measure
530 | their performance on real systems, and as we pointed out in Chapter 1,
531 | the *de facto* specification of the respective mechanisms is the version
532 | implemented in Linux. We now describe one specific way to perform
533 | those measurements, illustrating one methodology that is widely
534 | practiced today. Our approach uses *Netesto (Network Test Toolkit)*, a
535 | collection of software tools available on GitHub. The alternative is a
536 | simulation-based approach, with NS-3 being the most popular open-source tool.
537 |
538 | .. _reading_ns3:
539 | .. admonition:: Further Reading
540 |
541 |    `Netesto `__
542 |
543 |    `NS-3 Network Simulator `__
544 |
545 | Note that while the experiments described in this section measure real
546 | congestion control algorithms (which, of course, we have not yet
547 | described in any detail), the intent is to outline how algorithms are
548 | evaluated, and not to actually draw any conclusions about specific
549 | mechanisms.
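Before turning to the experimental setup, it is worth making the
harm-based threshold from Section 3.3 concrete. The sketch below is a
deliberate simplification of our own making, not the precise measure
Ware proposes: it treats harm as the fractional reduction in a flow's
goodput relative to a baseline run, and deems a new mechanism B
deployable if the harm B inflicts on A's flows stays within the harm
that A's flows already inflict on each other. The goodput numbers are
hypothetical.

.. code-block:: c

   #include <stdbool.h>
   #include <stdio.h>

   /* Fractional harm: how much a competitor reduces a flow's goodput
    * relative to what it achieved in a baseline run. A simplified
    * stand-in for Ware's proposed measures. */
   double harm(double baseline, double with_competitor)
   {
       return (baseline - with_competitor) / baseline;
   }

   int main(void)
   {
       /* Hypothetical goodputs, in Mbps, for one A-managed flow. */
       double solo = 10.0;   /* running alone                   */
       double vs_a = 5.2;    /* competing with another A flow   */
       double vs_b = 4.6;    /* competing with a B flow instead */

       double bound = harm(solo, vs_a);   /* harm A already does to A */
       bool ok = harm(solo, vs_b) <= bound;

       printf("A-on-A harm %.2f, B-on-A harm %.2f: B %s the bound\n",
              bound, harm(solo, vs_b), ok ? "meets" : "exceeds");
       return 0;
   }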
550 |
551 | 3.4.1 Experimental Setup
552 | ~~~~~~~~~~~~~~~~~~~~~~~~
553 |
554 | Our approach uses real TCP senders/receivers running on Linux hosts,
555 | with a range of behaviors studied using a combination of kernel
556 | facilities such as the ``netem`` and ``tbf`` queuing disciplines
557 | (qdiscs). Performance data is then collected using ``tcpdump``. The
558 | network connecting the end-hosts is constructed from a combination of
559 | real switches and emulated elements, supporting, for example,
560 | wide-area delays and low-bandwidth links.
561 |
562 | The experiments can be characterized along two orthogonal
563 | dimensions. One is the topology of the network. This includes link
564 | bandwidths, RTTs, buffer sizes, and so on. The other dimension is the
565 | traffic workload we run on the network. This includes the number and
566 | duration of flows, as well as the characteristics of each flow (e.g.,
567 | stream vs. RPC).
568 |
569 | With respect to network topology, we evaluate algorithms on three
570 | specific configurations:
571 |
572 | * LAN with :math:`20\mu\rm{s}` RTT and 10-Gbps link bandwidth. This scenario
573 |   represents servers in the same datacenter rack.
574 |
575 | * WAN with 10ms RTT and 10-Gbps link bandwidth, with delay introduced
576 |   on the receiver by configuring a 20,000-packet send queue. The
577 |   bottleneck is a real switch with shallow buffers (1-2 MB). This is a
578 |   good scenario for visualizing an algorithm’s dynamics when looking at
579 |   two to three flows.
580 |
581 | * WAN with 40ms RTT and 10/100-Mbps bottleneck bandwidth, with an
582 |   intermediate router introduced to reduce the link bandwidth to 10 or
583 |   100 Mbps. This scenario reflects a connection an end-user might
584 |   experience on a modern network.
585 |
586 | :numref:`Figure %s <fig-10gig>` shows the topology for the first two
587 | scenarios, where the senders and receivers are connected through a
588 | single switch. Delay is achieved for the second scenario using
589 | ``netem`` on the receiver, which affects only the ACKs being sent
590 | back.
591 |
592 | .. _fig-10gig:
593 | .. figure:: figures/Slide2.png
594 |    :width: 350px
595 |    :align: center
596 |
597 |    Topology for 10-Gbps Tests, optionally with 10ms of delay introduced.
598 |
599 | :numref:`Figure %s <fig-100meg>` shows the topology for the third
600 | scenario, where the router is implemented by a server-based forwarder
601 | that throttles outgoing link bandwidth using the ``tbf`` qdisc.
602 |
603 | .. _fig-100meg:
604 | .. figure:: figures/Slide3.png
605 |    :width: 550px
606 |    :align: center
607 |
608 |    Topology for 10-Mbps and 100-Mbps Tests with 10ms or 40ms of delay
609 |    introduced.
610 |
611 | With respect to traffic workload, we evaluate the dynamics and
612 | fairness of algorithms with the following tests:
613 |
614 | * 2-flow Test: The first flow lasts 60 seconds, and the second flow lasts
615 |   20 seconds and starts 22 seconds after the first one.
616 |
617 | * 3-flow Test: The first flow lasts 60 seconds, the second flow lasts 40
618 |   seconds and starts 12 seconds after the first one, and the third flow
619 |   lasts 20 seconds and starts 26 seconds after the first one.
620 |
621 | These tests make it possible to:
622 |
623 | * Examine how quickly existing flows adapt to new flows.
624 |
625 | * Examine how quickly flows adapt to released bandwidth from terminating flows.
626 |
627 | * Measure fairness between flows with the same (or different) congestion algorithm(s).
628 | * Measure levels of congestion.
629 |
630 | * Identify conditions under which performance changes abruptly,
631 |   signalling a possible instability.
632 |
633 | Additional tests include a combination of streaming flows plus 10-KB and
634 | 1-MB RPCs. These tests allow us to see if the smaller RPC flows are
635 | penalized, and if so, by how much. In particular, they make it possible to:
636 |
637 | * Study behavior under increasing loads.
638 |
639 | * Measure the performance (throughput and latency) of 1-MB and 10-KB
640 |   flows, as well as how fairly the available bandwidth is divided
641 |   between them.
642 |
643 | * Identify conditions under which retransmissions or latency change
644 |   abruptly, signalling an instability.
645 |
646 |
647 | 3.4.2 Example Results
648 | ~~~~~~~~~~~~~~~~~~~~~
649 |
650 | The following shows some example results, selected to illustrate the
651 | evaluation process. We start with a simple 2-flow experiment, where
652 | both flows are managed by the same congestion-control algorithm.
653 | :numref:`Figure %s <fig-graph_1a>` shows the resulting goodput
654 | graph. As one would hope, once the second flow (in red) starts just
655 | after 20 seconds, the goodputs of the two flows converge towards a nearly
656 | equal share of the available bandwidth. This convergence is not
657 | immediate (the two plots cross over roughly ten seconds after the
658 | second flow begins), a behavior other algorithms try to correct (e.g.,
659 | by using explicit feedback from routers). On the plus side, the first
660 | flow does quickly adapt to the released bandwidth once the second flow
661 | terminates.
662 |
663 | .. _fig-graph_1a:
664 | .. figure:: figures/Graph_1A.png
665 |    :width: 500px
666 |    :align: center
667 |
668 |    Goodput (bytes per second delivered end-to-end) realized by two
669 |    flows running under the same congestion-control algorithm.
670 |
671 | It is also possible to look more closely at these two flows, for
672 | example, by tracking the congestion window for each. The corresponding
673 | plot is shown in :numref:`Figure %s <fig-graph_1b>`. Not surprisingly,
674 | different algorithms produce different congestion window "patterns"
675 | over time, as we will see in the next chapter.
676 |
677 | .. _fig-graph_1b:
678 | .. figure:: figures/Graph_1B.png
679 |    :width: 500px
680 |    :align: center
681 |
682 |    Congestion window (measured in bytes) for two flows competing for
683 |    bandwidth under the same congestion-control algorithm.
684 |
685 | We could repeat these experiments but vary the algorithm used by one
686 | of the flows. This would allow us to visualize how the two algorithms
687 | interact. If they are both fair, you would expect to see results
688 | similar to :numref:`Figure %s <fig-graph_1a>`. If not, you might see a
689 | graph similar to :numref:`Figure %s <fig-graph_6c>`, in which the
690 | second flow (Algorithm B) aggressively takes bandwidth away from the
691 | first flow (Algorithm A).
692 |
693 | .. _fig-graph_6c:
694 | .. figure:: figures/Graph_6C.png
695 |    :width: 500px
696 |    :align: center
697 |
698 |    Goodput (bytes per second delivered end-to-end) realized by two
699 |    flows running under different congestion-control algorithms, with
700 |    one flow receiving significantly less bandwidth than the other.
701 |
702 | These experiments can be repeated with three concurrent flows, but we
703 | turn next to evaluating how various algorithms treat different
704 | workloads.
In particular, we are interested in the question of *size
705 | fairness*, that is, how a given algorithm treats back-to-back 10-KB or
706 | 1-MB RPC calls when they have to compete with ongoing stream-based
707 | flows. Some example results are shown in :numref:`Figure %s
708 | <fig-graph_8b>` (1-MB RPCs) and :numref:`Figure %s <fig-graph_8c>`
709 | (10-KB RPCs). The figures show the performance of five different
710 | algorithms (represented by different colors), across test runs with 1,
711 | 2, 4, 8, and 16 concurrent streaming flows.
712 |
713 | .. _fig-graph_8b:
714 | .. figure:: figures/Graph_8B.png
715 |    :width: 500px
716 |    :align: center
717 |
718 |    Average goodput (measured in Gbps) realized by a sequence of
719 |    1-MB RPC calls for five different algorithms, when competing with
720 |    a varied number of TCP streams.
721 |
722 | .. _fig-graph_8c:
723 | .. figure:: figures/Graph_8C.png
724 |    :width: 500px
725 |    :align: center
726 |
727 |    Average goodput (measured in Gbps) realized by a sequence of
728 |    10-KB RPC calls for five different algorithms, when competing with
729 |    a varied number of TCP streams.
730 |
731 | The 1-MB results are unsurprising, with no significant outliers across
732 | the five algorithms, and the average goodput decreasing as the RPCs
733 | compete with more and more streams. Although not shown in :numref:`Figure
734 | %s <fig-graph_8b>`, the fourth algorithm (green), which performs best
735 | when all flows are stream-based, suffers a significant number of
736 | retransmissions when sharing the available bandwidth among RPC calls.
737 |
738 | The 10-KB results do have a significant outlier, with the third
739 | algorithm (yellow) performing significantly better, by a factor of
740 | four. If you plot latency rather than bandwidth—the more relevant metric
741 | for small-message RPC calls—it turns out the third algorithm both
742 | achieves the lowest latencies and does so consistently, with the 99th
743 | and 99.9th percentiles being the same.
744 |
745 | Finally, all of the preceding experiments can be repeated on a
746 | network topology that includes wide-area RTTs. Certainly inter-flow
747 | fairness and size fairness continue to be concerns, but there is also
748 | an increased likelihood that queuing delays become an issue. For
749 | example, :numref:`Figure %s <fig-graph_16b>` shows the 99th-percentile
750 | latencies for four different algorithms when the network topology includes a
751 | 10-Mbps bottleneck link and a 40ms RTT. One important observation
752 | about this result is that the second algorithm (red) performs poorly
753 | when there is less than one bandwidth-delay product of buffering
754 | available at the bottleneck router, calling attention to another
755 | variable that can influence your results.
756 |
757 | .. _fig-graph_16b:
758 | .. figure:: figures/Graph_16B.png
759 |    :width: 500px
760 |    :align: center
761 |
762 |    99th percentile latencies for 10-KB RPC calls when competing with a
763 |    single streaming flow on a 40ms WAN, measured for different
764 |    amounts of buffering at the bottleneck router.
765 |
766 | We conclude this discussion of experimental methodology by permitting
767 | ourselves one summary evaluation statement. When looking across a set
768 | of algorithms and a range of topology/traffic scenarios, our
769 | conclusion is this: *No single algorithm is better than all other
770 | algorithms under all conditions.* One explanation, as these examples
771 | demonstrate, is the sheer number of factors that have to be taken into consideration.
This also 772 | explains why congestion control continues to be a topic of interest 773 | for both network researchers and network practitioners. 774 | -------------------------------------------------------------------------------- /dict.txt: -------------------------------------------------------------------------------- 1 | ACK 2 | Aether 3 | Athuraliya 4 | BBR 5 | BPF 6 | eBPF 7 | Bemmel 8 | Braden 9 | Brakmo 10 | Cardwell 11 | Centric 12 | Cheng 13 | Chiu 14 | Connor 15 | Cwnd 16 | ECN 17 | Fastpass 18 | Gbps 19 | Geng 20 | GigE 21 | Goodput 22 | Gunn 23 | Hawe 24 | IPv 25 | Janey 26 | KBps 27 | Kahn 28 | Karels 29 | Karn 30 | Kleinrock 31 | Kobayashi 32 | Leffler 33 | Mahdavi 34 | Makefile 35 | Mbps 36 | Multipath 37 | Multiprotocol 38 | Nagle 39 | Nico 40 | O'Malley 41 | Malley 42 | OC 43 | Ott 44 | Paxson 45 | Ph 46 | QoS 47 | Quarterman 48 | RTP 49 | RTT 50 | Raj 51 | Ramakrishnan 52 | Ranysha 53 | Realtime 54 | Retransmit 55 | Saltzer 56 | Sanjeewa 57 | Semke 58 | TCP 59 | TFRC 60 | TFRC 61 | Usenix 62 | Vibert 63 | VMware 64 | Westwood 65 | Yeganeh 66 | Davie 67 | Metcalf 68 | Alizadeh 69 | Shalunov 70 | Cascone 71 | Vachuska 72 | Wi 73 | Fi 74 | Iyengar 75 | Swett 76 | Handley 77 | Padhye 78 | Widmer 79 | Padhye 80 | Firoiu 81 | Towsley 82 | Kurose 83 | Haiqing 84 | Jiang 85 | Liu 86 | Xie 87 | Yi 88 | Jamieson 89 | Sunay 90 | Gettys 91 | Briscoe 92 | Xu 93 | Radhika 94 | Mittal 95 | Yihua 96 | Gao 97 | Wischik 98 | Raiciu 99 | Greenhalgh 100 | Micheloni 101 | vanBemmel 102 | Omer 103 | Shapira 104 | Giulio 105 | 106 | 107 | al 108 | asymptotes 109 | asymptoting 110 | backoff 111 | basestation 112 | basestations 113 | bitrate 114 | burstiness 115 | bufferbloat 116 | bursty 117 | centric 118 | codec 119 | connectionless 120 | cwnd 121 | datagram 122 | deployability 123 | deployable 124 | et 125 | goodput 126 | granularities 127 | halvings 128 | incrementing 129 | jitter 130 | kbps 131 | kilobits 132 | latencies 133 | misdelivered 134 | multipath 135 | nd 136 | piecewise 137 | pre 138 | queueing 139 | resends 140 | retransmission 141 | retransmissions 142 | retransmit 143 | retransmits 144 | retransmitted 145 | retransmitting 146 | rollout 147 | sawtooth 148 | subflow 149 | subflows 150 | teardown 151 | throughputs 152 | timeframe 153 | toolset 154 | topologies 155 | uncongested 156 | underutilizing 157 | unscaled 158 | utilizations 159 | virtualenv 160 | acked 161 | learnings 162 | intra 163 | misordering 164 | lossy 165 | todo 166 | th 167 | Vidhi 168 | Goel 169 | -------------------------------------------------------------------------------- /figures/Figure-sources.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Figure-sources.pptx -------------------------------------------------------------------------------- /figures/Graph_16B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_16B.png -------------------------------------------------------------------------------- /figures/Graph_1A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_1A.png -------------------------------------------------------------------------------- /figures/Graph_1B.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_1B.png -------------------------------------------------------------------------------- /figures/Graph_6C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_6C.png -------------------------------------------------------------------------------- /figures/Graph_8B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_8B.png -------------------------------------------------------------------------------- /figures/Graph_8C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Graph_8C.png -------------------------------------------------------------------------------- /figures/Slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide1.png -------------------------------------------------------------------------------- /figures/Slide10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide10.png -------------------------------------------------------------------------------- /figures/Slide11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide11.png -------------------------------------------------------------------------------- /figures/Slide12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide12.png -------------------------------------------------------------------------------- /figures/Slide13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide13.png -------------------------------------------------------------------------------- /figures/Slide14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide14.png -------------------------------------------------------------------------------- /figures/Slide15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide15.png -------------------------------------------------------------------------------- /figures/Slide16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide16.png 
-------------------------------------------------------------------------------- /figures/Slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide2.png -------------------------------------------------------------------------------- /figures/Slide3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide3.png -------------------------------------------------------------------------------- /figures/Slide4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide4.png -------------------------------------------------------------------------------- /figures/Slide5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide5.png -------------------------------------------------------------------------------- /figures/Slide6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide6.png -------------------------------------------------------------------------------- /figures/Slide7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide7.png -------------------------------------------------------------------------------- /figures/Slide8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide8.png -------------------------------------------------------------------------------- /figures/Slide9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/Slide9.png -------------------------------------------------------------------------------- /figures/f03-16-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f03-16-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-03-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-03-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-04-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-04-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-05-9780123850591.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-05-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-08-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-08-9780123850591.png -------------------------------------------------------------------------------- /figures/f05-10-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f05-10-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-03-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-03-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-05-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-05-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-08-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-08-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-09-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-09-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-10-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-10-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-11-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-11-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-12-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-12-9780123850591.png -------------------------------------------------------------------------------- /figures/f06-13-9780123850591.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-13-9780123850591.png 
--------------------------------------------------------------------------------
/figures/f06-14-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-14-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-15-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-15-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-16-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-16-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-17-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-17-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-18-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-18-9780123850591.png
--------------------------------------------------------------------------------
/figures/f06-19-9780123850591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SystemsApproach/tcpcc/abb44dfa1a05e457243995dac0630e73febd82f5/figures/f06-19-9780123850591.png
--------------------------------------------------------------------------------
/foreword.rst:
--------------------------------------------------------------------------------
1 | Foreword
2 | ==========
3 |
4 | Congestion control is unquestionably one of the most important, most
5 | fundamental topics in computer networking. It’s also one of the most
6 | challenging, as it requires controlling endpoints that are potentially
7 | distributed around the globe, in different organizations, and
8 | supporting different applications. The role of the network layer in
9 | supporting transport-layer congestion control is also a multi-faceted,
10 | nuanced challenge. And congestion control is needed in just about
11 | every Internet scenario one can imagine: from the public Internet that
12 | spans the globe and carries all types of traffic, to long “fat” pipes
13 | carrying massive amounts of file-transfer data, to specialized
14 | datacenter networks, to private commercial backbone networks, to
15 | mobile and wireless networks.
16 |
17 | With all of these challenges, how does one make sense of the many
18 | (many!) approaches towards congestion control that have been
19 | developed? What are the fundamental challenges these approaches are
20 | solving? What is the role of the network layer, and more broadly what
21 | is the design space for congestion control protocols? Are there broad
22 | classes or approaches towards congestion control that can be
23 | identified? Which approaches have been adopted in practice, and why?
24 | And among those many “flavors”/variations of TCP that you might have
25 | heard about—how do they differ and in what scenarios are they best
26 | used, and why? So many questions!
27 |
28 | To make sense of this and to answer all of these questions (and more)
29 | would require not just a book, but a great book! And now fortunately,
30 | there is such a book—this book! The three authors of *TCP Congestion
31 | Control: A Systems Approach* are among the most knowledgeable
32 | congestion control researchers on the planet—Brakmo and Peterson’s
33 | TCP Vegas protocol (you can learn more about that in Section 5.1)
34 | pioneered the notion that endpoints could anticipate and avoid
35 | congestion, rather than react to observed congestion; TCP Vegas has
36 | served as a foundation on which more recent congestion avoidance
37 | protocols (such as the BBR protocol championed by Google, see Section
38 | 5.3) have been designed. The authors are also absolutely fabulous
39 | writers (and I say this as a textbook author myself)—lucid, clear, and
40 | engaging, and able to organize and communicate complex ideas, with
41 | just the right amount of detail and discussion of practice. The
42 | “systems approach” that Larry Peterson and Bruce Davie have championed
43 | is also exactly what is needed to truly understand congestion control,
44 | where deep, system-wide issues in network architecture come to the
45 | fore (e.g., the separation and interaction of network and transport
46 | layer functionalities; the question of implementing network services,
47 | such as congestion control, in either the application layer or in
48 | the network).
49 |
50 | This book is a needed and most welcome addition to the fabulous set of
51 | open source, “systems approach” books that Larry, Bruce and others
52 | have been developing. I hope you read it cover-to-cover, consult it
53 | again later as you need it in the future, and enjoy it as much as I
54 | have.
55 |
56 | | Jim Kurose
57 | | Amherst, Massachusetts
--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: _static/SystemsApproachLogoURL.png
2 |    :width: 300px
3 |    :align: center
4 |    :target: https://systemsapproach.org
5 |
6 | |
7 |
8 | TCP Congestion Control: A Systems Approach
9 | =============================================
10 |
11 | Peterson, Brakmo, and Davie
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 |
14 | |
15 |
16 | .. toctree::
17 |    :maxdepth: 2
18 |    :caption: Table of Contents
19 |
20 |    foreword.rst
21 |    preface.rst
22 |    intro.rst
23 |    tcp_ip.rst
24 |    design.rst
25 |    algorithm.rst
26 |    avoidance.rst
27 |    aqm.rst
28 |    variants.rst
29 |    biblio.rst
30 |    README.rst
31 |    authors.rst
32 |    latest.rst
33 |    print.rst
--------------------------------------------------------------------------------
/intro.rst:
--------------------------------------------------------------------------------
1 | Chapter 1: Introduction
2 | ========================
3 |
4 | The Internet is considered an engineering success with few peers, and
5 | rightfully so. It has scaled to connect billions of devices, supports
6 | every imagined communications application, and accommodates
7 | transmission rates ranging from tens of bits per day to hundreds of
8 | gigabits per second.
But at its core is a thorny technical challenge 9 | that has drawn widespread attention for the last 30-plus years, from 10 | both practitioners trying to make the Internet perform better and 11 | theoreticians wanting to understand its mathematical underpinnings: 12 | how the Internet’s resources are best allocated to all the competing 13 | interests trying to use it. 14 | 15 | Resource allocation is a hard problem in any computer system, but 16 | especially so for a system as complex as the Internet. The problem was 17 | not top-of-mind when the Internet’s TCP/IP protocol stack was first 18 | deployed in the early 1980s. By the end of the decade, however, with the 19 | Internet gaining serious use in universities (but predating 20 | the World Wide Web's invention by several years), the network began 21 | to experience a 22 | phenomenon known as *congestion collapse*. A solution—congestion 23 | control—was developed and deployed in the late 1980s and the 24 | immediate crisis was addressed. The 25 | Internet community has been studying and refining its approach to 26 | congestion control ever since. This book is about that journey. 27 | 28 | The most famous early efforts to manage congestion were undertaken by two 29 | researchers, Van Jacobson and Mike Karels. The resulting paper, 30 | *Congestion Avoidance and Control*, published in 1988, is one of the 31 | most cited papers in networking of all time. There are 32 | good reasons for that. One is that congestion collapse really did 33 | threaten the nascent Internet, and the work undertaken to 34 | address it was foundational to the Internet's ultimate 35 | success. Without that work it's unlikely we'd have the global Internet 36 | we have today. 37 | 38 | Another reason for the citation impact of this work is that congestion 39 | control has been an amazingly fruitful area of research for over three 40 | decades. Congestion control, and resource allocation more broadly, are 41 | wide open design spaces with plenty of room for innovation. Decades of 42 | research and implementation have built on the early foundations, and 43 | it seems fair to assume that new approaches 44 | or refinements to the existing approaches will continue to appear for 45 | as long as the Internet exists. 46 | 47 | In this book, we explore the design space for congestion control in 48 | the Internet and present a description of the major approaches to 49 | managing or avoiding congestion that 50 | have been developed over the last three decades. 51 | 52 | 53 | .. _reading_vj: 54 | .. admonition:: Further Reading 55 | 56 | V. Jacobson. `Congestion Avoidance and Control 57 | `__. 58 | ACM SIGCOMM '88 Symposium, August 1988. 59 | 60 | 61 | 1.1 What is Congestion? 62 | ------------------------ 63 | 64 | Anyone who has driven on a highway at rush hour has experienced 65 | congestion. There is a limited resource—the space on the highway—and a 66 | set of cars, trucks, etc. that compete for that resource. As rush hour 67 | gets underway, more traffic arrives but the road keeps working as 68 | intended, just with more vehicles on it. But there 69 | comes a point where the number of vehicles becomes so large that 70 | everyone has to slow down (because there is no longer enough space for 71 | everyone to keep a safe distance at the speed limit) at which point the 72 | road actually becomes *less effective* at moving vehicles. 
So, just at 73 | the point when you would be wanting more capacity, there is actually 74 | less capacity to move traffic, as illustrated in :numref:`Figure %s `. This is the essence of *congestion 75 | collapse*, when congestion is so bad that the system starts to perform 76 | significantly worse than it did without congestion. The mechanism of congestion collapse is quite a bit different for 77 | packet networks than for highways, but it is equally problematic [#]_. 78 | 79 | 80 | .. _fig-collapse: 81 | .. figure:: figures/Slide1.png 82 | :width: 400px 83 | :align: center 84 | 85 | As load increases, throughput rises then falls at the point of 86 | congestion collapse. 87 | 88 | 89 | .. [#] Networking people like making analogies between real-world 90 | congestion and network congestion, but it's important to 91 | recognize that analogies are imperfect. 92 | 93 | This book focuses on congestion control for packet-switched 94 | networks. A fundamental aspect of packet switching is *multiplexing*, 95 | which is the means by which a system resource—such as a link or a 96 | queue in a router—is shared among multiple users or applications. In 97 | the case of the Internet, packet networks are *statistically 98 | multiplexed*, which means that, as packets show up somewhat randomly, 99 | we rely on the statistical properties of those arrivals to ensure that 100 | we don't run out of resources. The existence of congestion collapse 101 | shows that sometimes the statistics don't quite work out as we'd like. 102 | 103 | To see how this might work, consider the simple network illustrated in 104 | :numref:`Figure %s `, where the three hosts on the left side 105 | of the network (senders S1-S3) are sending data to the three hosts on 106 | the right (receivers R1-R3) by sharing a switched network that 107 | contains only one physical link. (For simplicity, assume that host S1 108 | is sending data to host R1, and so on.) In this situation, three flows 109 | of data—corresponding to the three pairs of hosts—are multiplexed onto 110 | a single physical link by switch 1 and then *demultiplexed* back into 111 | separate flows by switch 2. Note that we are being intentionally vague 112 | about exactly what a “flow of data” corresponds to for now, but we 113 | will make this more precise in later chapters. 114 | 115 | .. _fig-mux: 116 | .. figure:: figures/Slide11.png 117 | :width: 400px 118 | :align: center 119 | 120 | Multiplexing multiple logical flows over a single 121 | physical link. 122 | 123 | Statistical multiplexing means that all the hosts in this network send 124 | packets whenever it suits them, and if it happens that several packets 125 | turn up at the same time at a switch, one of them will be transmitted 126 | first while the others are placed into a queue. So both the link and 127 | the queue are shared resources, and both are finite. The link can 128 | only carry so many bits per second, and the queue can only hold so 129 | many packets (or bytes) before it has to start discarding 130 | packets. Managing the access to these shared resources, and trying to 131 | do so in a way that prevents congestion collapse, is the essence 132 | of congestion control. A switch that occasionally puts packets in a 133 | queue is operating normally. A switch that has large numbers of 134 | packets in its queues all or most 135 | of the time is congested. We'll get to the definition of congestion 136 | collapse for networks later on, but it starts with congested switches, 137 | routers or links. 
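
To make the sharing story concrete, the following small simulation models a tail-drop FIFO queue in front of a link. This is our own illustrative sketch, not part of the book's code directory, and every constant in it is an invented assumption: arrivals are bursty and average slightly more packets per tick than the link can drain, so the queue climbs until it sits persistently full and the switch starts discarding packets.

.. code-block:: c

   /* Sketch of a congested switch port: a tail-drop FIFO queue in
    * front of a fixed-rate link.  All constants are illustrative
    * assumptions.  Offered load averages ~12 packets per tick against
    * a link that drains 10, so the queue eventually overflows. */
   #include <stdio.h>
   #include <stdlib.h>

   #define QUEUE_LIMIT 100   /* queue capacity, in packets */
   #define LINK_RATE    10   /* packets transmitted per tick */

   int main(void)
   {
       int queue = 0, drops = 0;

       for (int tick = 1; tick <= 100; tick++) {
           int arrivals = rand() % 25;    /* bursty arrivals, mean ~12 */

           queue += arrivals;
           if (queue > QUEUE_LIMIT) {     /* tail drop: queue is full */
               drops += queue - QUEUE_LIMIT;
               queue = QUEUE_LIMIT;
           }
           queue -= (queue < LINK_RATE) ? queue : LINK_RATE;
           printf("tick %3d: queue=%3d total drops=%d\n",
                  tick, queue, drops);
       }
       return 0;
   }

In the early ticks the queue occasionally drains, which is a switch operating normally; by the end of the run it hovers at its limit and drops accumulate, which is a switch that is congested.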
138 | 139 | For a deeper introduction to statistical multiplexing, and why it's 140 | the approach of choice for packet networks, we refer to the 141 | following text. 142 | 143 | .. _reading_statmux: 144 | .. admonition:: Further Reading 145 | 146 | `Requirements 147 | `__. 148 | *Computer Networks: A Systems Approach*, 2020. 149 | 150 | 151 | When a switch builds a queue of packets awaiting transmission, it 152 | needs to decide which packet gets sent next. Each switch in a 153 | packet-switched network makes this decision independently, on a 154 | packet-by-packet basis. One of the issues that arises is how to make 155 | this decision in a fair manner. For example, many switches are 156 | designed to service packets on a first-in, first-out (FIFO) 157 | basis. Another approach would be to transmit the packets from each of 158 | the different flows that are currently sending data through the switch 159 | in a round-robin manner. This might be done to ensure that certain 160 | flows receive a particular share of the link’s bandwidth or that they 161 | never have their packets delayed in the switch for more than a certain 162 | length of time. A network that attempts to allocate bandwidth to 163 | particular flows is sometimes said to support *Quality-of-Service 164 | (QoS)*. 165 | 166 | One thing to take away from this discussion is that it is in the 167 | nature of packet-switched networks that they will sometimes be 168 | congested. The focus of this book is on the large body of work that 169 | has been done to mitigate congestion, either by responding to it in 170 | effective ways to lessen it, or by preventing it before it occurs. 171 | 172 | 1.2 Controlling Congestion 173 | --------------------------- 174 | 175 | Resource allocation and congestion control are complex issues that have 176 | been the subject of much study ever since the first network was 177 | designed. They are still active areas of research. One factor that makes 178 | these issues complex is that they are not isolated to a single level 179 | of a protocol hierarchy. Resource allocation is partially implemented in 180 | the routers, switches, and links inside the network and partially in the 181 | transport protocol running on the end hosts. End systems may use 182 | signalling protocols to convey their resource requirements to network 183 | nodes, which respond with information about resource 184 | availability. Application protocols may themselves be designed to mitigate 185 | congestion, for example, by changing the resolution of video transmission 186 | based on the current network conditions. This is a canonical example 187 | of a *systems issue*: you can't fully understand congestion without 188 | looking at all the places in the system that it touches. 189 | 190 | We should clarify our terminology before going any further. By *resource 191 | allocation*, we mean the process by which network elements try to meet 192 | the competing demands that applications have for network 193 | resources—primarily link bandwidth and buffer space in routers or 194 | switches. Of course, it will often not be possible to meet all the 195 | demands, meaning that some users or applications may receive fewer 196 | network resources than they want. Part of the resource allocation 197 | problem is deciding when to say no and to whom. 198 | 199 | We use the term *congestion control* to describe the efforts made by 200 | network nodes (including end systems) to prevent or respond to overload conditions. 
Since 201 | congestion is generally bad for everyone, the first order of business is 202 | making congestion subside, or preventing it in the first place. This 203 | might be achieved simply by persuading a few hosts to stop sending, thus 204 | improving the situation for everyone else. However, it is more common 205 | for congestion-control mechanisms to have some aspect of fairness—that 206 | is, they try to share the pain among all users, rather than causing 207 | great pain to a few. Thus, we see that many congestion-control 208 | mechanisms have some sort of resource allocation built into them. 209 | 210 | It is also important to understand the difference between flow control 211 | and congestion control. Flow control involves keeping a fast sender from 212 | overrunning a slow receiver. Congestion control, by contrast, is 213 | intended to keep a set of senders from sending too much data *into the 214 | network* because of lack of resources at some point. These two concepts 215 | are often confused; as we will see, they also share some mechanisms. 216 | 217 | Given all the different places and layers where congestion control and resource 218 | allocation can be implemented, it is helpful to start with a simple 219 | approach, which is pretty much what Jacobson and Karels did (although 220 | their solution ended up having quite a few moving parts). 221 | 222 | In the early Internet, routers implemented the most basic resource 223 | allocation approach possible: FIFO queuing with tail drop. There was 224 | no awareness of flows or applications, so they simply accepted packets 225 | as they arrived, placed them in a queue whenever the outbound link 226 | capacity was less than the arrival rate, served the queue by the FIFO 227 | discipline, and dropped arriving packets if the queue was full 228 | ("tail-drop"). This is still the most common form of queuing 229 | today; we will discuss other approaches to queuing including 230 | *Active Queue Management* in a later chapter. 231 | 232 | The reason that congestion collapse occurred in the early Internet is that 233 | dropped packets are not just discarded and forgotten. When the 234 | end-to-end transport protocol is TCP, as it is for most Internet 235 | traffic, a dropped packet will be retransmitted. So as congestion 236 | rises, the number of retransmitted packets rises; in other words, the 237 | number of packets sent into the network increases even if there is no 238 | real increase in the offered load from users and applications. More 239 | packets lead to more drops leading to more retransmissions and so 240 | on. You can see how this leads to collapse. 241 | 242 | A useful term in this context is *goodput*, which is distinguished 243 | from throughput in the sense that only packets doing useful work are 244 | counted towards goodput. So, for example, if a link is running at 100% 245 | utilization, but 60% of the packets on that link are retransmitted due 246 | to earlier losses, you could say the goodput was only 40%. 247 | 248 | The key insight of early researchers on congestion control was that it 249 | was possible and necessary for TCP to do something other than blindly retransmit 250 | lost packets during times of congestion. TCP would have to detect the 251 | congestion—which it can do, for example, by noticing the loss of 252 | packets—and then respond to the congestion by *reducing* the amount of 253 | traffic sent into the network. 
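
The shape of that response can be caricatured in a few lines of C. What follows is our own sketch of the general idea (back off sharply when loss signals congestion, probe gently for more bandwidth otherwise); it is not the actual TCP implementation, and the window limits are invented for illustration.

.. code-block:: c

   /* Caricature of detect-and-respond congestion control.  The sender
    * maintains a congestion window (cwnd) bounding how much data it
    * may have in flight.  Illustrative sketch only; the real
    * algorithms are the subject of later chapters. */
   #define MSS        1460U     /* assumed segment size, in bytes */
   #define MAX_WINDOW 65535U    /* assumed upper bound on cwnd */

   static unsigned int cwnd = MSS;   /* start conservatively */

   void on_ack(void)    /* delivery confirmed: probe for more */
   {
       if (cwnd + MSS <= MAX_WINDOW)
           cwnd += MSS;
   }

   void on_loss(void)   /* loss interpreted as congestion */
   {
       cwnd /= 2;       /* reduce traffic sent into the network */
       if (cwnd < MSS)
           cwnd = MSS;
   }

The asymmetry between the gentle increase and the sharp decrease is deliberate; as later chapters explain, it turns out to be central to keeping the resulting control loop stable.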
This interaction between the end-to-end
254 | protocol and the network during times of congestion formed the basis
255 | for much of today's congestion control and avoidance approaches. We'll
256 | get into the specifics of how these approaches work in subsequent
257 | chapters.
258 |
259 |
260 | 1.3 Theoretical Underpinnings
261 | ------------------------------
262 |
263 | There has been a lot of important theoretical work done to understand
264 | congestion. At the core of congestion is queuing, and there is a huge
265 | body of theory behind queuing, much of which extends into other
266 | physical realms such as supermarket checkouts and road congestion. The
267 | standard reference on queuing for packet networks was written by one
268 | of the early pioneers of the ARPANET, Leonard Kleinrock.
269 |
270 | .. _reading_queue:
271 | .. admonition:: Further Reading
272 |
273 |    L. Kleinrock. `Queueing Systems, Volume 2
274 |    `__. Wiley-Interscience, 1976.
275 |
276 | As packet networks became more widespread in the 1980s, there was a
277 | great deal of interest in how traffic behaved, with a growing
278 | realization that it might be more complex than had first been
279 | thought. One of the most popular models for data traffic was the
280 | Poisson model, which had worked well for various systems like call
281 | arrivals in the telephone network and people arriving at a queue in a
282 | supermarket. But the more that people studied the Internet and other
283 | packet networks, the worse the Poisson model started to look. There
284 | are a number of seminal papers that make the case for more complex
285 | models, of which the following are two.
286 |
287 | .. _reading_pfail:
288 | .. admonition:: Further Reading
289 |
290 |    V. Paxson and S. Floyd. `Wide-Area Traffic: The Failure of Poisson Modeling
291 |    `__.
292 |    IEEE/ACM Transactions on Networking, June 1995.
293 |
294 |
295 |    W. Leland *et al.*, `On the self-similar nature of Ethernet
296 |    traffic
297 |    `__.
298 |    ACM SIGCOMM '93 Symposium, August 1993.
299 |
300 | These papers and others contributed to the consensus that Internet
301 | traffic is much more “bursty”—packets arrive in clumps—than had been
302 | assumed by early models. Furthermore, this burstiness displays
303 | *self-similarity*—a property of fractals, whereby, as you zoom in,
304 | you keep seeing similar complexity at finer resolutions. For Internet
305 | traffic, this means that at any time scale, from microseconds to
306 | hours, you will see similar sorts of patterns.
307 |
308 | This research had a number of practical consequences, such as the
309 | realization that packet queues might get to be very long indeed, and
310 | thus routers and switches should have reasonably large packet
311 | buffers. (Correctly sizing those buffers became its own research
312 | topic.) Link utilizations could not be reliably kept close to 100% all
313 | the time, because you had to allow room for unpredictable bursts.
314 |
315 | Two topics of particular importance when thinking about congestion
316 | avoidance are *fairness* and *stability*. When the network is
317 | congested, it's going to be necessary for some users or flows to send
318 | less. It is clearly worth asking: which flows should send less? Should
319 | all flows share the pain equally? And what happens if some flows pay
320 | more attention to congestion signals than others? These questions are
321 | at the heart of the fairness issue. Jain's *fairness index* is one of
322 | the widely accepted ways to measure how fair a network is.
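
For reference, the index itself is simple to state (we give the standard definition here). If :math:`n` flows receive throughputs :math:`x_1, x_2, \ldots, x_n`, then

.. math::

   J(x_1, \ldots, x_n) = \frac{\left( \sum_{i=1}^{n} x_i \right)^2}{n \sum_{i=1}^{n} x_i^2}

The index equals 1 when every flow receives an equal share, and it falls toward :math:`1/n` as a single flow captures all of the capacity.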
We dig into
323 | this topic in Chapter 3.
324 |
325 | Stability is a critical property for any sort of control system, which
326 | is what congestion control is. Congestion is detected, some action is
327 | taken to reduce the total amount of traffic, causing congestion to
328 | ease, at which point it would seem reasonable to start sending more
329 | traffic again, leading back to more congestion. You can imagine that
330 | this sort of oscillation between congested and uncongested states
331 | could go on forever, and would be quite detrimental if the network
332 | kept swinging between underutilization and collapse. We really want
333 | the system to find an equilibrium where the network is busy but not so much so that
334 | congestion collapse occurs. Finding these stable control loops has
335 | been one of the key challenges for congestion control system designers
336 | over the decades. The quest for stability features heavily in the
337 | early work of Jacobson and Karels, and stability remains a requirement
338 | that subsequent approaches have to meet.
339 |
340 | Once the initial congestion control algorithms of TCP were implemented
341 | and deployed, researchers began to build mathematical models of TCP's
342 | behavior. This enabled the relationship between packet loss rate,
343 | round-trip time, and throughput to be established. The foundation was
344 | laid in the paper by Mathis and colleagues, and that body of work
345 | has continued to grow as the congestion control algorithms
346 | evolve. The idea that TCP would converge to a certain throughput given
347 | stable conditions of RTT and loss also formed the basis for
348 | *TCP-friendly rate control (TFRC)*. TFRC extends TCP-like congestion
349 | control to applications that don't use TCP, based on the idea that
350 | they can still share available capacity in a fair way with those that
351 | do. We return to this topic in Chapter 7.
352 |
353 | .. _reading_mathis_eqn:
354 | .. admonition:: Further Reading
355 |
356 |    M. Mathis, J. Semke, J. Mahdavi, and T. Ott. `The Macroscopic
357 |    Behavior of the TCP Congestion Avoidance Algorithm
358 |    `__.
359 |    SIGCOMM CCR, 27(3), July 1997.
360 |
361 | Finally, much of the theoretical work on congestion control has framed
362 | the problem as *"a distributed algorithm to share network resources
363 | among competing sources, where the goal is to choose source rate so as
364 | to maximize aggregate source utility subject to capacity
365 | constraints."* Formulating a congestion-control mechanism as an algorithm
366 | to optimize an objective function is traceable to a paper by Frank
367 | Kelly in 1997, an approach later extended by Sanjeewa Athuraliya and Steven
368 | Low to take into account both traffic sources (TCP) and router queuing
369 | techniques (AQM).
370 |
371 | .. _reading_kelly_low:
372 | .. admonition:: Further Reading
373 |
374 |    F. Kelly. `Charging and Rate Control for Elastic Traffic
375 |    `__.
376 |    European Transactions on Telecommunications, 8:33–37, 1997.
377 |
378 |    S. Athuraliya and S. Low. `An Empirical Validation of a Duality
379 |    Model of TCP and Active Queue Management Algorithms
380 |    `__. Proceedings of the
381 |    Winter Simulation Conference, 2001.
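
In the notation of this literature, each source :math:`s` chooses a sending rate :math:`x_s` that it values according to a utility function :math:`U_s`, and each link :math:`l` of capacity :math:`c_l` is shared by the set :math:`S(l)` of sources whose routes cross it. The goal quoted above can then be written, in a standard statement of Kelly's formulation (sketched here for reference), as

.. math::

   \max_{x \geq 0} \sum_{s} U_s(x_s)
   \quad \text{subject to} \quad
   \sum_{s \in S(l)} x_s \leq c_l \quad \text{for every link } l

In this view, TCP variants and AQM schemes can be interpreted as distributed algorithms for solving such an optimization, which is the connection that Athuraliya and Low make precise.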
382 |
383 | This book does not pursue the mathematical formulation outlined in
384 | these papers (and the large body of work that followed), but we do
385 | find it helpful to recognize that there is an established connection
386 | between optimizing a utility function and the pragmatic aspects of the
387 | mechanisms described in this book. Congestion control is an area of
388 | networking in which theory and practice have been productively linked
389 | to explore the solution space and develop robust approaches to the
390 | problem.
391 |
392 | 1.4 Congestion Control Today
393 | ----------------------------
394 |
395 | It sometimes feels like networking protocols have all been nailed down
396 | and standardized for decades, but few areas have remained as dynamic
397 | as congestion control. While the early work by Jacobson, Karels, and
398 | others laid the foundation, there has been a long series of
399 | innovations that continue today. We cover many of these in detail in
400 | subsequent chapters, but you can rest assured that new ideas in
401 | congestion control will continue to emerge for years to come.
402 |
403 | Sometimes innovations are necessitated by changes in the
404 | landscape. For example, as bandwidths increased from megabits to
405 | gigabits per second, the amount of data in flight at any instant
406 | increased, which raised the stakes for detecting and responding to
407 | congestion quickly. High-latency links, such as trans-oceanic cables
408 | and satellite links, added to this problem by raising the round-trip
409 | time (RTT). These
410 | situations led to such innovations as using delay (and changes to
411 | delay) as a congestion signal (first seen in TCP Vegas). Also, with these "fatter pipes", there is a
412 | greater incentive to get the pipe filled quickly; you don't want to
413 | spend 10 RTTs figuring out how quickly you can send data
414 | if your message could have been sent in one or two RTTs. This led to
415 | efforts to more quickly determine the bottleneck bandwidth, such as
416 | XCP, RCP, and Quick-start for TCP.
417 |
418 | Wireless networks, which became mainstream long after the early days
419 | of TCP, added a new issue to the mix: packet loss was no longer a
420 | reliable congestion signal, but could instead be attributed to a noisy
421 | radio channel. This led to a range of approaches either to hide the
422 | loss from the TCP hosts or to improve the mechanisms by which TCP
423 | detects congestion.
424 |
425 | Cloud datacenters became another "use case" for congestion-control
426 | mechanisms. Unlike the Internet in general, where end-to-end latencies
427 | are highly variable, the RTT in a datacenter is both predictable and
428 | relatively small (<10ms). And because the network is highly regular in
429 | structure (e.g., a leaf-spine fabric), it is well understood where and
430 | under what circumstances congestion is likely to occur. This makes TCP
431 | running in a datacenter ripe for a purpose-tuned algorithm rather than
432 | having to use the general-purpose mechanism that runs on the global
433 | Internet.
434 |
435 | New applications have also contributed to the interest in improving
436 | congestion control. One salient example is the rise of video streaming
437 | as the (currently) dominant source of traffic on the Internet. Again,
438 | there were many approaches developed to make video work better under
439 | conditions of congestion.
One that has enjoyed great success is
440 | *Dynamic Adaptive Streaming over HTTP (DASH)*, in which the server
441 | delivering the video switches from one quality of encoding to another
442 | (and hence from one bit-rate to another) in response to the measured
443 | congestion on the path to the receiver. This moves the congestion
444 | control loop up to the application layer, or rather, it adds a second
445 | control loop on top of the one already provided by TCP.
446 |
447 | This quick tour of innovations is hardly exhaustive, and we will see
448 | more detail on these and other approaches in the coming chapters. The
449 | important thing to understand at this point is that congestion control
450 | continues to evolve as the technology landscape and application
451 | requirements change.
452 |
453 |
454 | 1.5 Reference Implementation
455 | -------------------------------
456 |
457 | We saw in Section 1.3 that there is a rich body of literature studying
458 | the mathematical properties of congestion-control algorithms, yet
459 | congestion control remains a highly pragmatic concern. It is estimated
460 | that TCP connections carry 85% of the traffic on the Internet, and
461 | those connections are anchored in software implementations of TCP
462 | running in every imaginable OS (e.g., Linux, Windows, MacOS, iOS,
463 | Android). As a practical matter, the very specification of the
464 | congestion-control mechanisms we discuss in this book is represented
465 | in kernel-level code, typically implemented in C. The theory defines
466 | abstract models of this code, but the code *specifies* the algorithm.
467 |
468 | If the implementation is effectively the specification, then which
469 | implementation is authoritative; which is the *reference
470 | implementation?* The answer has been the dominant open source
471 | implementation of the day. This was originally the *Berkeley Software
472 | Distribution (BSD)* implementation of Unix, and in fact, the initial
473 | algorithm proposed by Jacobson and Karels was a noteworthy feature of
474 | the Tahoe release of BSD 4.3 in 1988. This connection between BSD Unix
475 | and the TCP congestion-control algorithms was so strong that the
476 | variants of the algorithm became known (named) according to the BSD
477 | release: e.g., TCP Tahoe, and later TCP Reno.
478 |
479 | .. _reading_bsd:
480 | .. admonition:: Further Reading
481 |
482 |    S.J. Leffler, M.K. McKusick, M.J. Karels, and J.S. Quarterman. `The
483 |    Design and Implementation of the 4.3 BSD UNIX Operating System
484 |    `__.
485 |    Addison-Wesley, January 1989.
486 |
487 | .. sidebar:: Berkeley Unix
488 |
489 |    *Any student of the Internet should have an appreciation for
490 |    the role Berkeley Unix (aka BSD) played in the success of the
491 |    Internet. Unix, of course, originated at AT&T Bell Labs in the
492 |    early 1970s, but it was an investment by DARPA to support an
493 |    open source implementation of Unix—which was to include the
494 |    fledgling TCP/IP protocol stack—that proved to be
495 |    transformative.*
496 |
497 |    *At the time, the success of the Internet was not a foregone
498 |    conclusion. It was viewed by many as a research curiosity, and
499 |    certainly did not enjoy much support within the computing and
500 |    telecommunication incumbents of the day. It was largely because
501 |    universities (and their students) had access to an open
502 |    implementation of the Internet protocol stack, and affordable
503 |    hardware to run it on, that TCP/IP took root.
Seeding
504 |    transformative technology through open source software and
505 |    readily available hardware has proven to be a powerful
506 |    strategy, of which BSD is an early success story.*
507 |
508 | BSD and its descendants continue to this day (notably as FreeBSD), but BSD was eventually
509 | overtaken by Linux in the early 2000s as the *de facto* open source,
510 | Unix-based OS. All the variants of TCP congestion control described in
511 | this book are available (and can be optionally activated) in the Linux
512 | kernel. They have become the reference implementation of those
513 | algorithms, which leads us to our final point: The standard for
514 | evaluating TCP congestion-control mechanisms is empirical, by running
515 | real traffic between Linux-based implementations of TCP senders and
516 | receivers. The open question is: What traffic, and over what network?
517 |
518 | While useful insights can often be gained by observing the behavior of
519 | TCP connections running across the actual Internet, the wide
520 | variability (in both time and space) of "the Internet" makes
521 | controlled experiments virtually impossible. Instead, the current
522 | best practice is to run a collection of "representative flows" over
523 | isolated but "representative network topologies." There is no
524 | established gold standard for either the set of flows or the set of
525 | network topologies, so experimental results are never definitive. But
526 | the body of evidence collected using this methodology has proven
527 | sufficient to advance the state of the art over the years.
528 |
529 | For the purposes of this book, we use the experimental methodology
530 | described in Chapter 3. We use it both to visualize the behavior of
531 | the various algorithms (helping to build intuition) and to highlight
532 | problematic scenarios that continue to make congestion control such a
533 | challenging and interesting technical problem.
534 | -------------------------------------------------------------------------------- /latest.rst: --------------------------------------------------------------------------------
1 | .. role:: pop
2 |
3 | :pop:`Read The Latest!`
4 | ========================
5 |
6 | `Systems Approach Newsletter: `__ Stay
7 | up to date with the latest developments by subscribing to the
8 | `Systems Approach Newsletter
9 | `__, where the authors
10 | connect the concepts and lessons in this book to what's happening in
11 | the Internet today.
12 |
13 | `Book Series: `__ Also check out
14 | our companion books that cover emerging topics in more depth.
15 |
16 | * `Private 5G: A Systems Approach `__
17 |
18 | * `Software-Defined Networks: A Systems Approach `__
19 |
20 | * `Edge Cloud Operations: A Systems Approach `__
21 |
22 | .. * `TCP Congestion Control: A Systems Approach `__
23 | -------------------------------------------------------------------------------- /preface.rst: --------------------------------------------------------------------------------
1 | Preface
2 | =======
3 |
4 | Congestion control has been one of the most active areas of research
5 | in computer networking from the earliest days of packet switching. The
6 | work of Jacobson and Karels in the 1980s laid the foundation for
7 | decades of subsequent work by introducing a suite of congestion
8 | control mechanisms into TCP. This was done at a time of crisis, with
9 | the Internet showing signs of congestion collapse.
Ethernet inventor
10 | Bob Metcalfe famously predicted the Internet would collapse in the
11 | 1990s and followed up on his promise to eat his words when it did
12 | not. But it was clear even then that congestion control was not a
13 | fully solved problem, and improvements to the algorithms on which the
14 | Internet's smooth functioning depends have multiplied ever since.
15 |
16 | This book grew out of our own involvement in developing congestion
17 | control algorithms over the last three decades. There have been so
18 | many developments in congestion control over that time that it’s
19 | nearly impossible to include all of them. What we have tried to do in
20 | this book is to provide a framework for understanding congestion control
21 | as a systems problem, and to characterize the many approaches along a
22 | few main themes. For example, our work on TCP Vegas opened up a line
23 | of research that continues today, where the aim is to avoid severe
24 | congestion rather than react after it has set in. We thus consider
25 | avoidance-based approaches as one of the main categories of congestion
26 | control.
27 |
28 | We expect this to be an evolving manuscript. There are many efforts in
29 | congestion control that are not currently covered, the algorithms that
30 | are covered continue to be refined, and new approaches will likely
31 | emerge to address new use cases. We will update the book as necessary
32 | to reflect the state of the field. Please help by submitting your
33 | comments and feedback. We also welcome contributions to the on-line
34 | annotated bibliography.
35 |
36 | Finally, we extend our thanks to those who have contributed to the
37 | open source effort to improve this book. They include:
38 |
39 | - Bill Fisher
40 | - Giulio Micheloni
41 | - J van Bemmel
42 | - Omer Shapira
43 | - Nico Vibert
44 | - Vik Vanderlinden
45 | - Vidhi Goel
46 |
47 | Please send
48 | us your comments and feedback using the `Issues Link
49 | `__. See the `Wiki
50 | `__ for the latest todo
51 | list.
52 |
53 | | Larry Peterson, Lawrence Brakmo, and Bruce Davie
54 | | May 2022
55 |
56 | -------------------------------------------------------------------------------- /print.rst: --------------------------------------------------------------------------------
1 | .. role:: pop
2 |
3 | :pop:`Print Copies`
4 | ===========================
5 |
6 | We make all books in the *Systems Approach* series available as both
7 | print and e-books. This book is available via Amazon: `TCP Congestion Control: A Systems Approach `__
8 |
9 | `Book Series: `__ Also check out
10 | our companion books that cover networking and emerging topics in more depth.
11 |
12 | * `Computer Networks: A Systems Approach `__
13 |
14 | * `Software-Defined Networks: A Systems Approach
15 |   `__
16 |
17 | * `Private 5G: A Systems Approach `__
18 |
19 | * `Edge Cloud Operations: A Systems Approach `__
20 |
21 | .. * `TCP Congestion Control: A Systems Approach `__
22 |
23 | As participants in the Amazon Associate program, we may earn income from qualifying purchases using the links above.
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | Sphinx~=5.3.0
2 | doc8~=0.10.1
3 | docutils~=0.17.1
4 | reuse~=0.14.0
5 | sphinx-rtd-theme~=1.0.0
6 | sphinxcontrib-spelling~=7.3.2
7 | sphinx-multiversion~=0.2.4
8 | pytz~=2023.3
--------------------------------------------------------------------------------