├── .gitignore ├── LICENSE ├── README.md ├── docs ├── extra.css ├── history.md ├── img │ ├── 10gbe.jpg │ ├── Network_data_flow_through_kernel.png │ ├── bandwidth-limit.png │ ├── bsd.png │ ├── callgraph.png │ ├── client-limit-fq.png │ ├── client-limit.png │ ├── direct-10gbe.png │ ├── freebsd-cwnd-limit.png │ ├── freebsd-newreno.png │ ├── freebsd-slow-start.png │ ├── freebsd-throughput.png │ ├── mininet.png │ ├── packet-flow.png │ ├── profile-chargen.html │ ├── profile-chargen.png │ ├── profile-discard.html │ ├── profile-discard.png │ ├── profile-loopback6.html │ ├── profile-loopback6.png │ ├── server-limit.png │ ├── slow-start-10ms-large-rwnd.png │ ├── slow-start-10ms.png │ ├── slow-start.png │ ├── sndbuf-limit.png │ ├── sndwnd-limit.png │ ├── sockets.png │ └── statemachine.png ├── index.md ├── links.md ├── nagle.md ├── profile.md ├── profiles │ ├── chargen_epoll_ipv4_10g.pb.gz │ ├── chargen_ipv6_10g.pb.gz │ ├── chargen_send_ipv6_10g.pb.gz │ ├── discard_epoll_ipv4_10g.pb.gz │ ├── discard_ipv4_10g.pb.gz │ ├── discard_recv_ipv4_10g.pb.gz │ ├── echo_epoll_ipv4_10g.pb.gz │ ├── roundtrip_ipv6_10g.pb.gz │ └── tcp_rr_client_ipv4_10g_kernel4.19.pb.gz ├── recovery.md ├── reno.md ├── slowstart.md ├── sockets.md ├── stevens.md ├── throughput.md └── walkthrough.md ├── mkdocs.yml └── papers ├── MIT-LCS-TR-494.pdf ├── TCP-misbehaving-receiver-CCR99.pdf ├── TCP_Congestion_Control_Comparison.pdf ├── cardwell-modeling-TCP-latency-infocom2000.pdf ├── compare-autotune02.pdf ├── congavoid.pdf ├── cubic08.pdf ├── dynamics-91.pdf ├── ff96-sacks.pdf ├── mathis-tcpautotune-sigcomm98.pdf ├── mathis-tcpmodel-ccr97.pdf ├── paxson-e2e-packets-sigcomm97.pdf ├── paxson-tcpanaly-sigcomm97.pdf ├── reneging.pdf └── traffic-policing-sigcomm16.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / 
packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Shuo Chen 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tcpip-study 2 | Shuo's learning notes on Linux TCP/IP stack 3 | 4 | https://chenshuo.github.io/tcpip-study/ 5 | 6 | ```bash 7 | $ sudo python3 -m pip install mkdocs mkdocs-graphviz 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/extra.css: -------------------------------------------------------------------------------- 1 | p { 2 | line-height:1.5em; 3 | margin:0 0 1.0ex; 4 | } 5 | 6 | .wy-nav-content { 7 | max-width: 1024px; 8 | } 9 | 10 | .rst-content code { 11 | color: #080; 12 | } 13 | 14 | .rst-content blockquote { 15 | color: #666; 16 | padding: 0 1em; 17 | margin-left: 1em; 18 | margin-bottom: 0; 19 | border-left: .25em solid; 20 | } 21 | 22 | .rst-content h2 { 23 | margin-top: 24px; 24 | margin-bottom: 12px; 25 | } 26 | 27 | .rst-content .section ul { 28 | margin-bottom: 12px; 29 | } 30 | 31 | .hljs-comment, .hljs-quote 32 | { 33 | color: #080; 34 | font-style: normal; 35 | } 36 | -------------------------------------------------------------------------------- /docs/history.md: -------------------------------------------------------------------------------- 1 | # History 2 | 3 | * [A Protocol for Packet Network 
Intercommunication](https://www.cs.princeton.edu/courses/archive/fall06/cos561/papers/cerf74.pdf), 1974-05. 4 | * [RFC675] Specification of Internet Transmission Control Program, 70 pp. 1974-12. 5 | * [RFC761] Transmission Control Protocol, iii + 84 pp., 1980-01. 6 | * [RFC793] Transmission Control Protocol, iii + 85 pp., 1981-09. 7 | * [RFC9293] Transmission Control Protocol (TCP), Wesley M. Eddy, 2022-08. 8 | 9 | In 2004, [Vinton Cerf](https://en.wikipedia.org/wiki/Vint_Cerf) and [Robert Kahn](https://en.wikipedia.org/wiki/Bob_Kahn) received the ACM Turing Award for their foundational work on TCP/IP. 10 | -------------------------------------------------------------------------------- /docs/img/10gbe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/10gbe.jpg -------------------------------------------------------------------------------- /docs/img/Network_data_flow_through_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/Network_data_flow_through_kernel.png -------------------------------------------------------------------------------- /docs/img/bandwidth-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/bandwidth-limit.png -------------------------------------------------------------------------------- /docs/img/bsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/bsd.png -------------------------------------------------------------------------------- /docs/img/callgraph.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/callgraph.png -------------------------------------------------------------------------------- /docs/img/client-limit-fq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/client-limit-fq.png -------------------------------------------------------------------------------- /docs/img/client-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/client-limit.png -------------------------------------------------------------------------------- /docs/img/direct-10gbe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/direct-10gbe.png -------------------------------------------------------------------------------- /docs/img/freebsd-cwnd-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/freebsd-cwnd-limit.png -------------------------------------------------------------------------------- /docs/img/freebsd-newreno.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/freebsd-newreno.png -------------------------------------------------------------------------------- /docs/img/freebsd-slow-start.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/freebsd-slow-start.png -------------------------------------------------------------------------------- /docs/img/freebsd-throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/freebsd-throughput.png -------------------------------------------------------------------------------- /docs/img/mininet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/mininet.png -------------------------------------------------------------------------------- /docs/img/packet-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/packet-flow.png -------------------------------------------------------------------------------- /docs/img/profile-chargen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/profile-chargen.png -------------------------------------------------------------------------------- /docs/img/profile-discard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/profile-discard.png -------------------------------------------------------------------------------- /docs/img/profile-loopback6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/profile-loopback6.png -------------------------------------------------------------------------------- /docs/img/server-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/server-limit.png -------------------------------------------------------------------------------- /docs/img/slow-start-10ms-large-rwnd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/slow-start-10ms-large-rwnd.png -------------------------------------------------------------------------------- /docs/img/slow-start-10ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/slow-start-10ms.png -------------------------------------------------------------------------------- /docs/img/slow-start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/slow-start.png -------------------------------------------------------------------------------- /docs/img/sndbuf-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/sndbuf-limit.png -------------------------------------------------------------------------------- /docs/img/sndwnd-limit.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/sndwnd-limit.png -------------------------------------------------------------------------------- /docs/img/sockets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/sockets.png -------------------------------------------------------------------------------- /docs/img/statemachine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/img/statemachine.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Shuo Chen's notes on Linux TCP/IP stack 2 | 3 | Source of this site: 4 | 5 | The TCP state machine has three inputs: Sockets API, Timers, and Data arrives. 6 | 7 | ![TCP](img/statemachine.png) 8 | 9 | Overview of packet flow: 10 | 11 | ![](img/packet-flow.png) 12 | 13 | A closer look at Linux networking datapath callgraph: 14 | 15 | ![Callgraph](img/callgraph.png) 16 | 17 | ## TCP/IP Reference 18 | 19 | _TCP/IP Illustrated (vol. 1): The Protocols, 2nd ed._ by [Kevin R. Fall](http://kfall.com/) and [W. Richard Stevens](http://www.kohala.com/start/), 2011-11. 20 | 21 | * [RFC793](https://tools.ietf.org/html/rfc793) Transmission Control Protocol, Jon Postel, 1981-09. 22 | * [RFC813](https://tools.ietf.org/html/rfc813) Window and Acknowledgement Strategy in TCP, David D. Clark, 1982-07. 
23 | * [RFC1122](https://tools.ietf.org/html/rfc1122) Requirements for Internet Hosts --- Communication Layers, 1989-10 24 | * [RFC5681](https://tools.ietf.org/html/rfc5681) TCP Congestion Control, 2009-09 25 | * [RFC6093](https://tools.ietf.org/html/rfc6093) On the Implementation of the TCP Urgent Mechanism, 2011-01, 26 | which recommends against the use of urgent mechanism. 27 | * [RFC6582](https://www.rfc-editor.org/rfc/rfc6582) The NewReno Modification to TCP's Fast Recovery Algorithm, 2012-04 28 | * [RFC7323](https://tools.ietf.org/html/rfc7323) TCP Extensions for High Performance, obsoletes [RFC1323](https://tools.ietf.org/html/rfc1323) 29 | * [RFC7413](https://tools.ietf.org/html/rfc7413) TCP Fast Open 30 | * [RFC7414](https://tools.ietf.org/html/rfc7414) TCP Roadmap 31 | * [RFC8289](https://tools.ietf.org/html/rfc8289) Controlled Delay Active Queue Management 32 | * [RFC8312](https://tools.ietf.org/html/rfc8312) CC-[CUBIC](https://en.wikipedia.org/wiki/CUBIC_TCP), 33 | Linux's default [congestion control algorithm](https://en.wikipedia.org/wiki/TCP_congestion_control) since 2.6.19, 34 | replaced [BIC](https://en.wikipedia.org/wiki/BIC_TCP) (default from 2.6.8 till 2.6.18.x). FreeBSD will use CUBIC as [the new default](https://reviews.freebsd.org/D36537), replacing [NewReno](https://freebsdfoundation.org/wp-content/uploads/2021/05/TCP-Cubic-is-Ready-to-Take-Flight.pdf). 35 | * [RFC8985](https://tools.ietf.org/html/rfc8985) RACK-TLP Loss Detection Algorithm for TCP 36 | * [RFC9293](https://tools.ietf.org/html/rfc9293) Transmission Control Protocol (TCP), obsoletes RFC793 after 40+ yrs, 2022-08. 37 | * Many others in 'Links' page. 
38 | 39 | ## TCP/IP Implementations 40 | 41 | AFAIK, there are four independent mainstream TCP/IP stacks: BSD, Linux, Windows, 42 | and [Solaris](https://github.com/kofemann/opensolaris/blob/master/usr/src/uts/common/inet/tcp/tcp.c) ([Mentat TCP](https://en.wikipedia.org/wiki/Mentat_Portable_Streams) and [archived page](https://web.archive.org/web/19990422220032/http://www.mentat.com/tcp/tcpfaq.html)). 43 | I guess BSD stack is also used on macOS and iOS, Android uses Linux stack. 44 | So I guess most of the traffic on the Internet happens between the first three TCP/IP stacks. 45 | 46 | * BSD family, [BSD family tree](https://svnweb.freebsd.org/base/head/share/misc/bsd-family-tree?view=markup) 47 | ![bsd](img/bsd.png) 48 | * [4.2BSD](https://www.tuhs.org/cgi-bin/utree.pl?file=4.1cBSD/a/sys/netinet) was the first widely available TCP/IP implementation. 49 | * [4.4BSD-Lite](https://github.com/chenshuo/4.4BSD-Lite2/tree/master/sys/netinet) is covered in great detail in 50 | _TCP/IP Illustrated (vol. 2): The Implementation_ by Gary R. Wright and W. Richard Stevens, 1995. 51 | * [FreeBSD](https://cgit.freebsd.org/src/tree/sys/netinet), 52 | * User space TCP/IP stack from FreeBSD 11.0, . 53 | 54 | * Linux, some early history 55 | * First in 0.98 by Ross Biro, [`net/tcp`](https://elixir.bootlin.com/linux/0.98/source/net/tcp), 1992-09-29 56 | * Switched to a new one (NET2) by Fred van Kempen in 0.99.10, [`net/inet`](https://elixir.bootlin.com/linux/0.99.10/source/net/inet), 1993-06-07 57 | * NET3 by Swansea University Computer Society (Alan Cox) took place in 1.1.4. 58 | * In 1.2.10 -> 1.3.0, moved from `net/inet` to [`net/ipv4`](https://elixir.bootlin.com/linux/latest/source/net/ipv4). 59 | Last update to [`net/inet`](https://elixir.bootlin.com/linux/1.2.13/source/net/inet) was in 1.2.13 60 | * In 2.1.8, added `net/ipv6` 61 | * In 2.2.0pre5, renamed to NET4, early 1999. 
62 | * 63 | * 64 | 65 | * [lwIP](https://en.wikipedia.org/wiki/LwIP) / uIP / [picoTCP](https://github.com/tass-belgium/picotcp) 66 | * For microcontrollers, small footprint 67 | * gvisor / [netstack](https://github.com/google/gvisor/tree/master/pkg/tcpip) 68 | * User space, in Golang 69 | * Others, mostly user space 70 | * 71 | * 72 | * Educational OSes 73 | * Minix [2.x](https://github.com/chenshuo/old-minix/tree/master/src/inet) has its own TCP/IP stack, 3.x uses lwIP instead. 74 | * Xinu [code](ftp://ftp.cs.purdue.edu/pub/comer/TCPIP-vol2.dist.tar.gz), 75 | covered in _Internetworking With TCP/IP Volume II: Design, Implementation, and Internals, 3rd ed._ by 76 | [Douglas E. Comer](https://www.cs.purdue.edu/homes/comer/netbooks.html) and David L. Stevens, 1999. 77 | * Toy implementations 78 | * 79 | * 80 | * Stanford CS144 81 | 82 | ## Tools 83 | 84 | * [packetdrill](https://github.com/google/packetdrill) is a unittest for entire TCP/IP stack. 85 | * [neper](https://github.com/google/neper) is a performance testing tool to generate workloads. 86 | 87 | ## Recent changes 88 | 89 | Recent changes that I am aware of. 90 | 91 | * **EDT** [netdev](https://netdevconf.info/) [0x12](https://netdevconf.info/0x12/) [Keynote](https://netdevconf.info/0x12/session.html?evolving-from-afap-teaching-nics-about-time): 92 | Evolving from AFAP: Teaching NICs about time by [Van Jacobson](https://en.wikipedia.org/wiki/Van_Jacobson), 93 | [slides](https://www.files.netdevconf.info/d/4ee0a09788fe49709855/) and [video](https://youtu.be/MAni0_lN7zE). 94 | * [Linux 4.20](https://kernelnewbies.org/Linux_4.20#TCP:_switch_to_Early_Departure_Time_model) [switched](https://lwn.net/ml/netdev/20180921155154.49489-1-edumazet@google.com/) to 95 | Early Departure Time model in 2018/09, and [refined](https://lwn.net/ml/netdev/20181015163758.232436-1-edumazet@google.com/) in 2018/10. 
96 | -------------------------------------------------------------------------------- /docs/links.md: -------------------------------------------------------------------------------- 1 | # Links 2 | 3 | Literatures I have or haven't read. 4 | 5 | * [Sizing Router Buffers - Small is the New Big](https://community.juniper.net/blogs/sharada-yeluri/2023/02/22/sizing-router-buffers) 6 | * [Optimizing TCP for high WAN throughput while preserving low latency](https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/) 7 | * [Tuning Linux TCP for data-center networks](https://lpc.events/event/16/contributions/1343/attachments/1027/1972/Tuning%20Linux%20TCP%20for%20data-center%20networks%20%283%29.pdf) by Yuchung Cheng, 8 | Linux Plumbers 2022. 9 | * [Socksdirect: datacenter sockets can be fast and compatible](https://dlnext.acm.org/doi/abs/10.1145/3341302.3342071), SIGCOMM '19. 10 | [PDF](https://www.microsoft.com/en-us/research/uploads/prod/2019/08/p90-li.pdf) 11 | * [Busypolling next generation](https://netdevconf.info/2.1/session.html?dumazet) by Eric Dumazet, 2017. 12 | * [Making Linux TCP Fast](https://netdevconf.info/1.2/session.html?yuchung-cheng) by Yuchung Cheng and Neal Cardwell, 2016. 13 | [paper](https://netdevconf.info/1.2/papers/bbr-netdev-1.2.new.new.pdf) 14 | * [TCP Options for Low Latency: Maximum ACK Delay and Microsecond Timestamps](https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf), by Neal Cardwell, Yuchung Cheng and Eric Dumazet, 2016. 15 | * [Kernel Networking Walkthrough](https://www.slideshare.net/ThomasGraf5/linuxcon-2015-linux-kernel-networking-walkthrough) by Thomas Graf (tglx), LinuxCon 2015. 16 | Nice and short (~20 slides) intro to NAPI, RSS, RPS, GRO, TSO, FastOpen with pictures. 17 | * Next year: [Kernel Networking Explained](https://www.slideshare.net/ThomasGraf5/linux-networking-explained) also by Thomas Graf, LinuxCon 2016, 27 slides. 
18 | * [Linux Networking Architecture](https://www.slideshare.net/hugolu/the-linux-networking-architecture/63) slides by Hugo Lu, 2014. 19 | * [Queueing in the Linux Network Stack](https://www.coverfire.com/articles/queueing-in-the-linux-network-stack/), 2013. 20 | * [TCP small queues](https://lwn.net/Articles/507065/), LWN 2012. 21 | * [Controlling Queue Delay](https://queue.acm.org/detail.cfm?id=2209336) by Kathleen Nichols and Van Jacobson, ACM Queue May 2012. 22 | * [EECS 489: Computer Networks at umich.edu](https://www.eecs.umich.edu/courses/eecs489/w10/syllabus.html) 23 | * [Tuning TCP Parameters for the 21st Century](https://www.ietf.org/proceedings/75/slides/tcpm-1.pdf), H.K. Jerry Chu, IETF 75, 2009. 24 | * [You don't know jack about Network Performance](https://queue.acm.org/detail.cfm?id=1066069) by Kevin Fall and Steve McCanne, ACM Queue June 2005. Clearly explains four types of [network delay](https://en.wikipedia.org/wiki/Network_delay). 25 | * [TCP Implementation in Linux: A Brief Tutorial](http://www.ece.virginia.edu/mv/research/DOE09/publications/TCPlinux.pdf), 2008. Nice two-page overview of TCP/IP stack in Linux 2.6.19. 26 | * [Scaling in the Linux Networking Stack](https://www.kernel.org/doc/Documentation/networking/scaling.txt), kernel doc that describes RSS, RPS, RFS, XPS, etc. 27 | * [Segmentation Offloads in the Linux Networking Stack](https://www.kernel.org/doc/Documentation/networking/segmentation-offloads.txt), about TSO, GSO, GRO, etc. 28 | * [Programming with the Netpoll API](http://people.redhat.com/~jmoyer/netpoll-linux_kongress-2005.pdf) by Jeff Moyer, Linux Kongress 2005. 29 | * [Kernel data flow of 2.6.20](https://wiki.linuxfoundation.org/networking/kernel_flow) ![img](img/Network_data_flow_through_kernel.png) 30 | * [Computer Networks: A Systems Approach 6/e](https://book.systemsapproach.org/) by Larry Peterson and Bruce Davie, 2020. 
31 | 32 | 33 | ## RFCs 34 | * [RFC1958](https://tools.ietf.org/html/rfc1958) Architectural Principles of the Internet, 1996-06. 35 | * [RFC2525](https://tools.ietf.org/html/rfc2525) Known TCP Implementation Problems, 1999-03. 36 | * [RFC2544](https://tools.ietf.org/html/rfc2544) Benchmarking Methodology for Network Interconnect Devices, 1999-03. 37 | * [RFC3150](https://tools.ietf.org/html/rfc3150) End-to-end Performance Implications of Slow Links, 2001-07. 38 | * [RFC3439](https://tools.ietf.org/html/rfc3439) Some Internet Architectural Guidelines and Philosophy, 2002-12. 39 | "Layering Considered Harmful." linked from [Internet protocol suite](https://en.wikipedia.org/wiki/Internet_protocol_suite#Layer_names_and_number_of_layers_in_the_literature) 40 | * [RFC6349](https://www.rfc-editor.org/rfc/rfc6349) Framework for TCP Throughput Testing, 2011-08. 41 | * [RFC8900](https://tools.ietf.org/html/rfc8900) IP Fragmentation Considered Fragile 42 | * [RFC9006](https://tools.ietf.org/html/rfc9006) TCP Usage Guidance in the Internet of Things (IoT), 2021-03. 43 | 44 | 45 | ## RFC drafts 46 | * , early version of [RFC8985](https://tools.ietf.org/html/rfc8985)? 47 | * 48 | 49 | 50 | ## Congestion Control 51 | 52 | * [TCP Congestion Control: A Systems Approach](https://tcpcc.systemsapproach.org/) by Peterson, Brakmo, and Davie. 53 | * [The Great Internet TCP Congestion Control Census](https://www.comp.nus.edu.sg/~ayush/images/sigmetrics2020-gordon.pdf) by Ayush Mishra, et al., SIGMETRICS 2020. 54 | [Slides 1](https://www.comp.nus.edu.sg/~bleong/slides/sigmetrics19-gordon-slides.pdf), 55 | [slides 2](https://datatracker.ietf.org/meeting/109/materials/slides-109-iccrg-the-great-internet-tcp-congestion-control-census-00). 
56 | * [The classification and evolution of variants of TCP congestion control](https://www.researchgate.net/figure/The-classification-and-evolution-of-variants-of-TCP-congestion-control-Afanasyev-et-al_fig1_262053709) 57 | 58 | 59 | ## Emulation of lossy link 60 | 61 | * 62 | * [High Speed Network Protocols and Security Workshop](http://ce.sc.edu/cyberinfra/workshop_2020.html), 2020. 63 | * [Network Tools and Protocols Lab Manual](http://ce.sc.edu/cyberinfra/workshops/Material/NTP/NTP.pdf) 64 | * [High-Speed Networks: A Tutorial](https://link.springer.com/book/10.1007/978-3-030-88841-1), 2022. 65 | * 66 | * 67 | * set up a bridge and introduce packet loss and delay. 68 | 69 | 70 | ## Posts 71 | 72 | * [Segmentation and Checksum Offloading: Turning Off with ethtool](https://sandilands.info/sgordon/segmentation-offloading-with-wireshark-and-ethtool) by Dr Steven Gordon, 2010 73 | * [Reply from David Miller](https://seclists.org/tcpdump/2009/q3/14) about capturing packets when GSO is on. 74 | * [Coping with the TCP TIME-WAIT state on busy Linux servers](https://vincent.bernat.ch/en/blog/2014-tcp-time-wait-state-linux) 75 | * [Harping on ARP](https://lwn.net/Articles/45373/), [Multiple Interfaces on Same Ethernet Broadcast Network](https://www.kernel.org/doc/html/v4.18/networking/e100.html#multiple-interfaces-on-same-ethernet-broadcast-network) 76 | * [Increase HTTP Performance by Fitting In the Initial TCP Slow Start Window](https://sirupsen.com/napkin/problem-15) 77 | * [Experimenting with TCP Congestion control](https://dipsingh.github.io/TCP-Congestion-Experiment/) 78 | -------------------------------------------------------------------------------- /docs/nagle.md: -------------------------------------------------------------------------------- 1 | # Nagle's Algorithm 2 | 3 | [Nagle's Algorithm](https://en.wikipedia.org/wiki/Nagle%27s_algorithm) 4 | 5 | * [RFC896](https://tools.ietf.org/html/rfc896): Congestion Control in IP/TCP Internetworks, John Nagle , 
1984-01. 6 | * [Minshall's update](https://datatracker.ietf.org/doc/html/draft-minshall-nagle), 1999. 7 | 8 | 9 | * [The trouble with the Nagle algorithm ](https://developers.slashdot.org/comments.pl?sid=174457&cid=14515105) by John Nagle, 2006. 10 | 11 | * [Nginx Optimization: understanding sendfile, tcp_nodelay and tcp_nopush](https://thoughts.t37.net/nginx-optimization-understanding-sendfile-tcp-nodelay-and-tcp-nopush-c55cdd276765) 12 | 13 | * In reply to above post, John Nagle on Nagle's algorithm , 2015. 14 | 15 | > Sigh. If you're doing bulk file transfers, you never hit that problem. If you're sending enough data to fill up outgoing buffers, there's no delay. If you send all the data and close the TCP connection, there's no delay after the last packet. If you do send, reply, send, reply, there's no delay. If you do bulk sends, there's no delay. If you do send, send, reply, there's a delay. 16 | 17 | ``` 18 | # 100ms RTT 19 | 00.000000 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [S], seq 2158272677, win 42340, options [mss 1460,sackOK,TS val 1746926009 ecr 0,nop,wscale 9], length 0 20 | 00.100765 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [S.], seq 2484846425, ack 2158272678, win 43440, options [mss 1460,sackOK,TS val 4273081757 ecr 1746926009,nop,wscale 9], length 0 21 | 00.100823 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [.], ack 1, win 83, options [nop,nop,TS val 1746926110 ecr 4273081757], length 0 22 | 23 | # Send 2000 bytes, 1 RTT 24 | 02.397737 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [P.], seq 1:2001, ack 1, win 83, length 2000 25 | 02.498120 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [.], ack 2001, win 83, length 0 26 | 27 | # Send 1000 + 1000 bytes, 2 * RTT 28 | 08.085472 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [P.], seq 2001:3001, ack 1, win 83, length 1000 29 | 08.185750 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [.], ack 3001, win 83, length 0 30 | 08.185780 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [P.], seq 3001:4001, ack 1, win 83, length 1000 31 | 
08.286105 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [.], ack 4001, win 83, length 0 32 | 33 | # Set TCP_NODELAY, send 1000 + 1000 bytes 34 | 20.869188 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [P.], seq 4001:5001, ack 1, win 83, length 1000 35 | 20.869312 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [P.], seq 5001:6001, ack 1, win 83, length 1000 36 | 20.970120 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [.], ack 5001, win 83, length 0 37 | 20.970169 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [.], ack 6001, win 82, length 0 38 | 39 | # Bye 40 | 22.501407 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [F.], seq 6001, ack 1, win 83, length 0 41 | 22.601930 IP 10.0.0.2.2009 > 10.0.0.1.47748: Flags [F.], seq 1, ack 6002, win 83, length 0 42 | 22.601975 IP 10.0.0.1.47748 > 10.0.0.2.2009: Flags [.], ack 2, win 83, length 0 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/profile.md: -------------------------------------------------------------------------------- 1 | # Profiling Linux TCP/IP stack with perf and pprof 2 | 3 | At home, I have two Linux hosts with Mellanox 10GbE NICs (ConnectX EN 10GigE MT26448, bought used from eBay in 2017) 4 | directly connected using an SFP+ cable. 5 | 6 | ![10gbe](img/10gbe.jpg) 7 | 8 | ![10gbe](img/direct-10gbe.png) 9 | 10 | Throughput was about 1100MiB/s over 10GbE, both machines ran at ~40% CPU utilization in one thread. 11 | 12 | For comparison, run `openssl speed sha` on the Rx side machine, an i7-3770 @ 3.4GHz. 13 | 14 | ```bash 15 | $ openssl speed sha 16 | OpenSSL 1.1.1f 31 Mar 2020 17 | The 'numbers' are in 1000s of bytes per second processed. 
18 | type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes 19 | sha1 137355.08k 323943.17k 603290.54k 770941.95k 843352.75k 849619.63k 20 | sha256 75659.30k 167113.02k 289437.70k 354737.49k 379652.78k 381676.20k 21 | sha512 51745.33k 206941.63k 329443.07k 468301.82k 533897.22k 539525.12k 22 | ``` 23 | 24 | In short, sending data through TCP is faster than calculating SHA1 locally. 25 | 26 | Rx is more expensive, it uses about 2x CPU cycles than Tx. 27 | 28 | ## Tx path 29 | 30 | Profile taken on Debian bullseye (testing, pre-release 11) w/ kernel 5.6.14. 31 | Run the [chargen](https://github.com/chenshuo/recipes/blob/master/tpc/bin/chargen.cc) program 32 | to keep sending data to a `discard` server. 33 | 34 | [![chargen](img/profile-chargen.png)](img/profile-chargen.html) 35 | 36 | ## Rx path 37 | 38 | Profile taken on Ubuntu 18.04 w/ kernel 4.15 39 | Run the [discard](https://github.com/chenshuo/recipes/blob/master/tpc/bin/discard.cc) program 40 | to keep reading the socket. 41 | 42 | [![discard](img/profile-discard.png)](img/profile-discard.html) 43 | 44 | ## Loopback w/ IPv6 45 | 46 | Profile taken on Ubuntu 20.04 w/ kernel 5.4. 47 | Run both `chargen` and `discard` on the same i7-3770 host, 48 | throughput was about 3300MiB/s. `chargen` ran at 100% CPU, `discard` was about 74%. 
49 | 50 | [![loopback](img/profile-loopback6.png)](img/profile-loopback6.html) 51 | -------------------------------------------------------------------------------- /docs/profiles/chargen_epoll_ipv4_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/chargen_epoll_ipv4_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/chargen_ipv6_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/chargen_ipv6_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/chargen_send_ipv6_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/chargen_send_ipv6_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/discard_epoll_ipv4_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/discard_epoll_ipv4_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/discard_ipv4_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/discard_ipv4_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/discard_recv_ipv4_10g.pb.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/discard_recv_ipv4_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/echo_epoll_ipv4_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/echo_epoll_ipv4_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/roundtrip_ipv6_10g.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/roundtrip_ipv6_10g.pb.gz -------------------------------------------------------------------------------- /docs/profiles/tcp_rr_client_ipv4_10g_kernel4.19.pb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/docs/profiles/tcp_rr_client_ipv4_10g_kernel4.19.pb.gz -------------------------------------------------------------------------------- /docs/recovery.md: -------------------------------------------------------------------------------- 1 | # TCP loss recovery 2 | 3 | There are a number of ways to recover from packet (segment) loss; here's a list that I am aware of. 4 | 5 | * RTO, since 1981 (RFC793), most basic, catch-all. 6 | * Fast retransmit, since 1988. 3 duplicate ACKs. 
7 | * SACK 8 | * FACK 9 | * Early retransmit 10 | * RACK and TLP 11 | 12 | -------------------------------------------------------------------------------- /docs/reno.md: -------------------------------------------------------------------------------- 1 | # Reno Congestion Control 2 | 3 | Classic Reno CC algorithm fits in one page from `linux/net/ipv4/tcp_cong.c` 4 | 5 | ``` 6 | /* 7 | * TCP Reno congestion control 8 | * This is special case used for fallback as well. 9 | */ 10 | /* This is Jacobson's slow start and congestion avoidance. 11 | * SIGCOMM '88, p. 328. 12 | */ 13 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) 14 | { 15 | struct tcp_sock *tp = tcp_sk(sk); 16 | 17 | if (!tcp_is_cwnd_limited(sk)) 18 | return; 19 | 20 | /* In "safe" area, increase. */ 21 | if (tcp_in_slow_start(tp)) { 22 | acked = tcp_slow_start(tp, acked); 23 | if (!acked) 24 | return; 25 | } 26 | /* In dangerous area, increase slowly. */ 27 | tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); 28 | } 29 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 30 | 31 | /* Slow start is used when congestion window is no greater than the slow start 32 | * threshold. We base on RFC2581 and also handle stretch ACKs properly. 33 | * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but 34 | * something better;) a packet is only considered (s)acked in its entirety to 35 | * defend the ACK attacks described in the RFC. Slow start processes a stretch 36 | * ACK of degree N as if N acks of degree 1 are received back to back except 37 | * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and 38 | * returns the leftover acks to adjust cwnd in congestion avoidance mode. 
39 | */ 40 | u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) 41 | { 42 | u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); 43 | 44 | acked -= cwnd - tcp_snd_cwnd(tp); 45 | tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); 46 | 47 | return acked; 48 | } 49 | EXPORT_SYMBOL_GPL(tcp_slow_start); 50 | 51 | /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), 52 | * for every packet that was ACKed. 53 | */ 54 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) 55 | { 56 | // I guess "ai" here stands for additive increase 57 | // ... 58 | } 59 | 60 | /* Slow start threshold is half the congestion window (min 2) */ 61 | u32 tcp_reno_ssthresh(struct sock *sk) 62 | { 63 | const struct tcp_sock *tp = tcp_sk(sk); 64 | 65 | return max(tcp_snd_cwnd(tp) >> 1U, 2U); 66 | } 67 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 68 | ``` 69 | 70 | Where `tcp_in_slow_start()` and `tcp_is_cwnd_limited()` are defined in `linux/include/net/tcp.h`: 71 | 72 | 73 | ``` 74 | static inline bool tcp_in_slow_start(const struct tcp_sock *tp) 75 | { 76 | return tcp_snd_cwnd(tp) < tp->snd_ssthresh; 77 | } 78 | 79 | /* We follow the spirit of RFC2861 to validate cwnd but implement a more 80 | * flexible approach. The RFC suggests cwnd should not be raised unless 81 | * it was fully used previously. And that's exactly what we do in 82 | * congestion avoidance mode. But in slow start we allow cwnd to grow 83 | * as long as the application has used half the cwnd. 84 | * Example : 85 | * cwnd is 10 (IW10), but application sends 9 frames. 86 | * We allow cwnd to reach 18 when all frames are ACKed. 87 | * This check is safe because it's as aggressive as slow start which already 88 | * risks 100% overshoot. The advantage is that we discourage application to 89 | * either send more filler packets or data to artificially blow up the cwnd 90 | * usage, and allow application-limited process to probe bw more aggressively. 
91 | */ 92 | static inline bool tcp_is_cwnd_limited(const struct sock *sk) 93 | { 94 | const struct tcp_sock *tp = tcp_sk(sk); 95 | 96 | if (tp->is_cwnd_limited) 97 | return true; 98 | 99 | /* If in slow start, ensure cwnd grows to twice what was ACKed. */ 100 | if (tcp_in_slow_start(tp)) 101 | return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; 102 | 103 | return false; 104 | } 105 | ``` 106 | 107 | `tcp_reno_cong_avoid()` gets called from `tcp_rcv_established()`. 108 | 109 | ``` 110 | tcp_rcv_established() 111 | -> tcp_queue_rcv() // If income segment have payload 112 | -> tcp_ack() // This routine deals with incoming acks, but not outgoing ones. 113 | -> tcp_cong_control() // Common entry point, called toward the end of 114 | // processing an ACK with precise rate info. 115 | // All transmission or retransmission are delayed afterwards. 116 | -> if tcp_in_cwnd_reduction(): 117 | tcp_cwnd_reduction() 118 | elif tcp_may_raise_cwnd(): 119 | tcp_cong_avoid(): icsk->icsk_ca_ops->cong_avoid() 120 | -> tcp_reno_cong_avoid() or cubictcp_cong_avoid() 121 | -> tcp_data_snd_check() // If we can send more data 122 | -> tcp_push_pending_frames() 123 | -> tcp_check_space() // check if new space made available 124 | -> tcp_new_space() // if SOCK_NOSPACE is set in sk->sk_socket->flags 125 | -> if tcp_should_expand_sndbuf(): 126 | tcp_sndbuf_expand() 127 | -> __tcp_ack_snd_check() // Check if sending an ack is needed. 128 | ``` 129 | -------------------------------------------------------------------------------- /docs/slowstart.md: -------------------------------------------------------------------------------- 1 | # TCP Slow Start 2 | 3 | Standard slow start [RFC5681](https://www.rfc-editor.org/rfc/rfc5681.html#section-3.1): 4 | 5 | * Cwnd increases MSS upon receipt of an ACK covering new data of MSS. But Linux and FreeBSD differ when `bytes_acked > 2 * MSS`. 6 | * Effectively Cwnd doubles every round-trip time, Cwnd = IW * 2 ^ nRTT. 
7 | * Exits when a packet loss is detected, sets ssthresh to Cwnd/2, as per Reno CC. 8 | * Linux and FreeBSD don't increase Cwnd if transmitting is not limited by Cwnd (see `cubictcp_cong_avoid()` in `tcp_cubic.c` and `tcp_reno_cong_avoid()` in `tcp_cong.c`). 9 | 10 | With Initial Window = 10: 11 | 12 | ![](img/slow-start.png) 13 | 14 | ## Kernel knobs 15 | 16 | ``` 17 | $ sysctl -A |grep tcp_.mem 18 | # min default max 19 | net.ipv4.tcp_rmem = 4096 131072 6291456 20 | net.ipv4.tcp_wmem = 4096 16384 4194304 21 | ``` 22 | 23 | For `tcp_wmem[1] == 16K`, the default `sndbuf` is 16K. 24 | Which is greater than 10 (initial window) * 1460 (typical IPv4 MSS), 25 | works well for slow start. 26 | 27 | Linux 4.20 changed `tcp_rmem[1]` from 87k to 128KiB, [commit by Yuchung Cheng in 2018-09](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a337531b942bd8a03e7052444d7e36972aac2d92). 28 | So SYN rwin increased from 43k to 65k (when `tcp_adv_win_scale == 1`). 29 | 30 | With IW=10 and MSS=1.4k, during slow start, 31 | the old setting will hit window full on 3rd RTT, 32 | limit sent bytes to ~70kB. 33 | In new setting, 1.4 * (10+20+40) ~= 100k can be sent in 3RTT. 34 | 35 | ![](img/slow-start-10ms.png) 36 | 37 | ![](img/slow-start-10ms-large-rwnd.png) 38 | 39 | 40 | From `linux-stable/Documentation/networking/ip-sysctl.rst`: 41 | 42 | ``` 43 | tcp_adv_win_scale - INTEGER 44 | Count buffering overhead as bytes/2^tcp_adv_win_scale 45 | (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), 46 | if it is <= 0. 
Default: 1 47 | ``` 48 | 49 | In other words, 50 | 51 | | `tcp_adv_win_scale` | Advertised window ratio | Max adv win when `tcp_rmem[2] == 8M` | 52 | | ------------------ | ----------------------- | --- | 53 | | 0 | 100% | 8M | 54 | | 1 (default since Linux 3.4) | 50% | 4M | 55 | | 2 | 75% | 6M | 56 | | 3 | 87.5% | 7M | 57 | | -1 | 50% | 4M | 58 | | -2 | 25% | 2M | 59 | | -3 | 12.5% | 1M | 60 | 61 | This value was changed in Linux 3.4 from 2 to 1. 62 | Here's a brief history: 63 | 64 | | Kernel Version | `tcp_adv_win_scale` | sysctl `tcp_rmem[]` | Initial advertised rcvwnd | Max rcvwnd | commit | 65 | | --- | --- | --- | --- | --- | --- | 66 | | Before 3.4 | 2 | "4096 87380 4MiB" | 65535 = (87380 * 0.75) | 3MiB = (4MiB * 0.75) | | 67 | | 3.4 to 4.19 | 1 | "4096 87380 6MiB" | 43800 = (87380 * 0.5) | 3MiB = (6MiB * 0.5) | [2012-05](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b49960a05e32121d29316cfdf653894b88ac9190) | 68 | | Since 4.20 | 1 | "4096 128KiB 6MiB" | 64Ki = (128Ki * 0.5) | 3MiB = (6MiB * 0.5) | [2018-09](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a337531b942bd8a03e7052444d7e36972aac2d92)| 69 | 70 | ## HyStart++ 71 | 72 | Standard slow start ends when a packet loss is detected, but this often causes overshoot. 73 | 74 | _HyStart++ uses "increase in round-trip delay" as a heuristic to find an exit point before possible overshoot._ 75 | 76 | * [RFC9406](https://tools.ietf.org/html/rfc9406) HyStart++: Modified Slow Start for TCP, 2023-05. 77 | * Linux incorporated HyStart++ to CUBIC in v2.6.29, 2009. [commit by Sangtae Ha](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ae27e98a51526595837ab7498b23d6478a198960). 78 | * FreeBSD adds HyStart++ to its newreno CC in 2021, but not released as of 13.2. 79 | FreeBSD will switch to CUBIC (in release 14?). 
80 | * 81 | 82 | ## FreeBSD 83 | 84 | As noticed in [Low throughput due to small cwnd](throughput.md#small-cwnd), 85 | FreeBSD slow-start is sometimes much slower than Linux, 86 | and underutilizes the bandwidth of a link with long delay (say 50ms ~ 100ms). 87 | 88 | ![](img/freebsd-throughput.png) 89 | 90 | As I analyzed in , 91 | it's due to bad interaction with LRO and delayed-ACKs of receiver side. 92 | 93 | RFC 5681 states that 94 | 95 | ```text 96 | During slow start, a TCP increments cwnd by at most SMSS bytes for 97 | each ACK received that cumulatively acknowledges new data. While 98 | traditionally TCP implementations have increased cwnd by precisely 99 | SMSS bytes upon receipt of an ACK covering new data, we RECOMMEND 100 | that TCP implementations increase cwnd, per: 101 | 102 | cwnd += min (N, SMSS) 103 | 104 | where N is the number of previously unacknowledged bytes acknowledged 105 | in the incoming ACK. 106 | ``` 107 | 108 | If one ACK is generated per SMSS, the `cwnd` grows exponentially. 109 | In old times, 110 | 111 | ``` 112 | RTT1: cwnd = 1, send 1 MSS 113 | RTT2: got 1st ACK, cwnd = 2, send 2 MSS 114 | RTT3: got 2nd ACK, cwnd = 3, send 2 MSS 115 | got 3rd ACK, cwnd = 4, send 2 MSS 116 | RTT4: got 4 ACKs, cwnd = 8, send 8 MSS 117 | RTT5: got 8 ACKs, cwnd = 16, send 16 MSS 118 | ``` 119 | 120 | RFC 5681 also requires that 121 | 122 | > A receiver SHOULD generate an ACK for **at least every second full-sized segment.** 123 | 124 | If receiver follows this, the slow-start should work just fine. 125 | 126 | But in case of LRO and TSO, the sender sees much less ACKs than the old days. 
127 | 128 | Really slow start: 129 | 130 | ``` 131 | RTT1: cwnd = 10, send segment of 10 MSS with TSO 132 | RTT2: got 1st ACK (LRO in receiver), cwnd = 12, send 12 MSS with TSO 133 | RTT3: got 2nd ACK, cwnd = 14, send 14 MSS with TSO 134 | RTT4: got 3rd ACK, cwnd = 16, send 16 MSS with TSO 135 | RTT5: got 4th ACK, cwnd = 18, send 18 MSS with TSO 136 | ``` 137 | 138 | So `cwnd` grows more-or-less linearly until it reaches max TSO segments (65k / 1.4k =~ 45). 139 | I intuitively guess `cwnd` grows quadratically after that, 140 | i.e. two segments/ACKs per RTT, then three segments/ACKs per RTT, and so on. 141 | 142 | FreeBSD 13.x TCP sender closely follows RFC 5681 with RFC 3465 extension, 143 | It also addressed the LRO of the sender side (multiple ACKs being coalesced into one). 144 | `sys/netinet/cc/cc_newreno.c` 145 | 146 | ```c 147 | static void 148 | newreno_ack_received(struct cc_var *ccv, uint16_t type) 149 | { 150 | // ... 151 | /* 152 | * Regular in-order ACK, open the congestion window. 153 | * Method depends on which congestion control state we're 154 | * in (slow start or cong avoid) and if ABC (RFC 3465) is 155 | * enabled. 156 | * 157 | * slow start: cwnd <= ssthresh 158 | * cong avoid: cwnd > ssthresh 159 | * 160 | * slow start and ABC (RFC 3465): 161 | * Grow cwnd exponentially by the amount of data 162 | * ACKed capping the max increment per ACK to 163 | * (abc_l_var * maxseg) bytes. 164 | * 165 | * slow start without ABC (RFC 5681): 166 | * Grow cwnd exponentially by maxseg per ACK. 167 | * 168 | * ... 
169 | */ 170 | 171 | // In slow-start 172 | if (V_tcp_do_rfc3465) { 173 | uint16_t abc_val; 174 | 175 | if (ccv->flags & CCF_USE_LOCAL_ABC) 176 | abc_val = ccv->labc; 177 | else 178 | abc_val = V_tcp_abc_l_var; // sysctl value, default = 2 179 | 180 | // abc_val is 2 by default 181 | // ccv->nsegs is number of ACKs being aggregated due to LRO 182 | 183 | if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) 184 | incr = min(ccv->bytes_this_ack, 185 | ccv->nsegs * abc_val * 186 | CCV(ccv, t_maxseg)); 187 | else 188 | incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); 189 | } 190 | 191 | // incr == 2*1448 = 2896 in normal slow start case 192 | ``` 193 | 194 | As discussed in sec3.2 of RFC 3465, L=2*SMSS bytes exactly balances 195 | the negative impact of the delayed ACK algorithm. 196 | 197 | But if the receiver (sink) also does LRO, it won't generate enough ACKs to open cwnd of the sender. 198 | Often we observe that FreeBSD is ``slower`` when sending data using TCP, compared to Linux or even Windows. 199 | 200 | 201 | As suggested in , 202 | a large `abc_l_var` should help in this situation. 203 | 204 | ``` 205 | # TCP Slow start gradually increases the data send rate until the TCP 206 | # congestion algorithm (CDG, H-TCP) calculates the networks maximum carrying 207 | # capacity without dropping packets. TCP Congestion Control with Appropriate 208 | # Byte Counting (ABC) allows our server to increase the maximum congestion 209 | # window exponentially by the amount of data ACKed, but limits the maximum 210 | # increment per ACK to (abc_l_var * maxseg) bytes. An abc_l_var of 44 times a 211 | # maxseg of 1460 bytes would allow slow start to increase the congestion window 212 | # by more than 64 kilobytes per step; 65535 bytes is the TCP receive buffer 213 | # size of most hosts without TCP window scaling. 
214 | # 215 | net.inet.tcp.abc_l_var=44 # (default 2) if net.inet.tcp.mssdflt = 1460 216 | ``` 217 | 218 | ### Traces 219 | 220 | When `h_ertt` is on, it will momentarily disable TSO whilst marking a 221 | packet to use for a new RTT measurement, resulting in more segments being sent, 222 | and more ACK received (from Linux), then cwnd could increase faster. 223 | 224 | Linux enters 'quickack' mode right after connection establishment, 225 | disabling delayed-ACKing for a while, to help peer measuring RTT and accelerate slow-start. 226 | But FreeBSD won't, it always delays ACKing (IIUC). 227 | 228 | ``` 229 | // linux-stable/include/net/tcp.h 230 | 231 | /* Maximal number of ACKs sent quickly to accelerate slow-start. */ 232 | #define TCP_MAX_QUICKACKS 16U 233 | ``` 234 | 235 | ``` 236 | // linux-stable/net/ipv4/tcp_input.c 237 | 238 | static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) 239 | { 240 | // ... 241 | 242 | if (!icsk->icsk_ack.ato) { 243 | /* The _first_ data packet received, initialize 244 | * delayed ACK engine. 
245 | */ 246 | tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); 247 | icsk->icsk_ack.ato = TCP_ATO_MIN; 248 | 249 | ``` 250 | 251 | * FreeBSD 13 sender, Linux receiver 252 | 253 | Really slow start on a 100ms link: 254 | 255 | ```c 256 | 0.000000 IP freebsd13 > linux: Flags [S], seq 205083268, win 65535, options [mss 1460,nop,wscale 10,sackOK,TS val 495212525 ecr 0], len 0 257 | 0.100786 IP linux > freebsd13: Flags [S.], seq 708257395, ack 205083269, win 65160, options [mss 1460,sackOK,TS val 563185696 ecr 495212525,nop,wscale 7], len 0 258 | 0.100800 IP freebsd13 > linux: Flags [.], ack 1, win 65, options [nop,nop,TS val 495212626 ecr 563185696], len 0 259 | 260 | // cwnd = 10 261 | 0.101062 IP freebsd13 > linux: Flags [.], seq 1:14481, ack 1, win 65, len 14480 262 | 0.201241 IP linux > freebsd13: Flags [.], ack 14481, win 427, len 0 263 | 264 | // cwnd = 12 265 | 0.201253 IP freebsd13 > linux: Flags [.], seq 14481:31857, ack 1, win 65, len 17376 266 | 0.301621 IP linux > freebsd13: Flags [.], ack 31857, win 411, len 0 267 | 268 | // cwnd = 14 269 | 0.301630 IP freebsd13 > linux: Flags [.], seq 31857:52129, ack 1, win 65, len 20272 270 | 0.402010 IP linux > freebsd13: Flags [.], ack 52129, win 395, len 0 271 | 272 | // cwnd = 16 273 | 0.402018 IP freebsd13 > linux: Flags [P.], seq 52129:73629, ack 1, win 65, len 21500 274 | 0.402026 IP freebsd13 > linux: Flags [.], seq 73629:75077, ack 1, win 65, len 1448 275 | 0.502392 IP linux > freebsd13: Flags [.], ack 75077, win 860, len 0 276 | 277 | // cwnd = 18 278 | 0.502398 IP freebsd13 > linux: Flags [.], seq 75077:101141, ack 1, win 65, len 26064 279 | 0.602775 IP linux > freebsd13: Flags [.], ack 101141, win 1267, len 0 280 | 281 | // cwnd = 20 282 | 0.602783 IP freebsd13 > linux: Flags [.], seq 101141:130101, ack 1, win 65, len 28960 283 | 0.703169 IP linux > freebsd13: Flags [.], ack 130101, win 1719, len 0 284 | 285 | // cwnd = 22 286 | 0.703177 IP freebsd13 > linux: Flags [P.], seq 130101:159297, ack 1, win 65, len 29196 
287 | 0.703185 IP freebsd13 > linux: Flags [.], seq 159297:160745, ack 1, win 65, len 1448 288 | 0.803367 IP linux > freebsd13: Flags [.], ack 160745, win 2198, len 0 289 | 290 | // cwnd = 24 291 | 0.803375 IP freebsd13 > linux: Flags [.], seq 160745:195497, ack 1, win 65, len 34752 292 | 0.903754 IP linux > freebsd13: Flags [.], ack 195497, win 2741, len 0 293 | 294 | // cwnd = 26 295 | 0.903762 IP freebsd13 > linux: Flags [.], seq 195497:233145, ack 1, win 65, len 37648 296 | ``` 297 | 298 | * FreeBSD 14 sender, Linux receiver 299 | 300 | Faster than 13, because `h_ertt` is on by default. 301 | 302 | ```c 303 | 0.000000 IP freebsd14 > linux: Flags [S], seq 3748224575, win 65535, options [mss 1460,nop,wscale 6,sackOK,TS val 790917200 ecr 0], len 0 304 | 0.100788 IP linux > freebsd14: Flags [S.], seq 489912744, ack 3748224576, win 65160, options [mss 1460,sackOK,TS val 380822116 ecr 790917200,nop,wscale 7], len 0 305 | 0.100851 IP freebsd14 > linux: Flags [.], ack 1, win 1027, options [nop,nop,TS val 790917303 ecr 380822116], len 0 306 | 307 | // cwnd = 10 308 | 0.114635 IP freebsd14 > linux: Flags [.], seq 1:14481, ack 1, win 1027, len 14480 309 | 0.215265 IP linux > freebsd14: Flags [.], ack 14481, win 445, len 0 310 | 311 | // cwnd = 12, h_ertt is on, send two segments, 1 MSS + 11 MSS 312 | 0.215334 IP freebsd14 > linux: Flags [.], seq 14481:15929, ack 1, win 1027, len 1448 313 | 0.215363 IP freebsd14 > linux: Flags [.], seq 15929:31857, ack 1, win 1027, len 15928 314 | 315 | // got ACK for 1 MSS, cwnd = 13, send 2 segments 316 | 0.316142 IP linux > freebsd14: Flags [.], ack 15929, win 501, len 0 317 | 0.316210 IP freebsd14 > linux: Flags [.], seq 31857:33305, ack 1, win 1027, len 1448 318 | 0.316240 IP freebsd14 > linux: Flags [.], seq 33305:34753, ack 1, win 1027, len 1448 319 | 320 | // got ACK for 11 MSS, cwnd = 15, send 1 segment of 13 MSS 321 | 0.316248 IP linux > freebsd14: Flags [.], ack 31857, win 440, len 0 322 | 0.316279 IP freebsd14 > linux: Flags 
[.], seq 34753:53577, ack 1, win 1027, len 18824 323 | 324 | // got ACK for 1 MSS, cwnd = 16, send 2 segments 325 | 0.416970 IP linux > freebsd14: Flags [.], ack 33305, win 524, len 0 326 | 0.417047 IP freebsd14 > linux: Flags [.], seq 53577:55025, ack 1, win 1027, len 1448 327 | 0.417077 IP freebsd14 > linux: Flags [.], seq 55025:56473, ack 1, win 1027, len 1448 328 | 329 | // got ACK for 1 MSS, cwnd = 17, send 1 segment of 2 MSS 330 | 0.417085 IP linux > freebsd14: Flags [.], ack 34753, win 546, len 0 331 | 0.417103 IP freebsd14 > linux: Flags [.], seq 56473:59369, ack 1, win 1027, len 2896 332 | 333 | // got ACK for 13 MSS, cwnd = 19, send 1 segment of 15 MSS 334 | 0.417111 IP linux > freebsd14: Flags [.], ack 53577, win 840, len 0 335 | 0.417146 IP freebsd14 > linux: Flags [.], seq 59369:81089, ack 1, win 1027, len 21720 336 | 337 | // And so on 338 | 0.517759 IP linux > freebsd14: Flags [.], ack 55025, win 863, len 0 339 | 0.517833 IP freebsd14 > linux: Flags [.], seq 81089:82537, ack 1, win 1027, len 1448 340 | 0.517862 IP freebsd14 > linux: Flags [.], seq 82537:83985, ack 1, win 1027, len 1448 341 | ``` 342 | 343 | * FreeBSD 13 sender with `h_ertt` on, same as FreeBSD 14 above 344 | 345 | * FreeBSD 14 sender, FreeBSD receiver 346 | 347 | Not as fast as Linux receiver, because the receiver delays ACKs. 348 | 349 | 350 | 351 | -------------------------------------------------------------------------------- /docs/sockets.md: -------------------------------------------------------------------------------- 1 | # IPv6 Sockets Programming Notes 2 | 3 | ## `struct sockaddr_in6` is bigger than `struct sockaddr` 4 | 5 | BSD Sockets API is not type-safe w.r.t. socket address structs, 6 | probably because it predates function signature checking in C. 
7 | We often need to cast a `struct sockaddr_in*` or `struct sockaddr_in6*` to `struct sockaddr*`, like: 8 | 9 | ``` 10 | // Sockets API often takes struct sockaddr*, which could point to 11 | // sockaddr_in, sockaddr_in6, or sockaddr_un. 12 | 13 | int bind(int sockfd, const struct sockaddr *addr, socklen_t addrlen); 14 | ``` 15 | 16 | ``` 17 | // IPv4 18 | struct sockaddr_in listen_addr = { 19 | .sin_family = AF_INET, 20 | .sin_port = htons(8000), 21 | .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 22 | }; 23 | 24 | int sockfd = socket(AF_INET, SOCK_STREAM, 0); 25 | if (bind(sockfd, (struct sockaddr *) &listen_addr, sizeof listen_addr) < 0) { 26 | perror("bind"); 27 | } 28 | ``` 29 | 30 | ``` 31 | // IPv6 32 | struct sockaddr_in6 addr6 = { 33 | .sin6_family = AF_INET6, 34 | .sin6_port = htons(8000), 35 | .sin6_addr = in6addr_any, 36 | }; 37 | 38 | int sockfd = socket(AF_INET6, SOCK_STREAM, 0); 39 | if (bind(sockfd, (struct sockaddr *) &addr6, sizeof addr6) < 0) { 40 | perror("bind"); 41 | } 42 | ``` 43 | 44 | On platforms I tested, `sizeof(struct sockaddr) == sizeof(struct sockaddr_in) == 16`, 45 | while `sizeof(struct sockaddr_in6) == 28`. 46 | So sockets functions that take `struct sockaddr*` as input parameter usually work fine, e.g. 47 | `bind()`, `connect()`, `sendto()`. 48 | 49 | However, be careful about output parameter in `accept()`, `recvfrom()`, `getpeername()`, and `getsockname()`. 50 | Don't use `struct sockaddr` to hold the result, unless you know it's AF_INET not AF_INET6. 51 | 52 |
53 | Wrong UDP echo server for IPv6 54 | ``` 55 | void udp_echo(int sockfd) 56 | { 57 | struct sockaddr peerAddr; // *** Can't hold sockaddr_in6 58 | socklen_t addrLen; 59 | char message[1024]; 60 | 61 | while (true) { 62 | addrLen = sizeof peerAddr; 63 | bzero(&peerAddr, sizeof peerAddr); 64 | ssize_t nr = recvfrom(sockfd, message, sizeof message, 0, &peerAddr, &addrLen); 65 | // peerAddr is truncated in IPv6, so message won't be echoed back to the sender. 66 | if (nr >= 0) { 67 | ssize_t nw = sendto(sockfd, message, nr, 0, &peerAddr, addrLen); 68 | } 69 | } 70 | } 71 | ``` 72 |
73 | 74 | `strace(1)` output: 75 | ``` 76 | recvfrom(3, "hello", 1024, 0, {sa_family=AF_INET6, sa_data="\222R\0\0\0\0\0\0\0\0\0\0\0\0"}, [16->28]) = 5 77 | sendto(3, "hello", 5, 0, {sa_family=AF_INET6, sin6_port=htons(37458), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::fa3b:3860:0:0", &sin6_addr), sin6_scope_id=539754}, 28) = 5 78 | ``` 79 | Note that the address was truncated `[16->28]` in `recvfrom()`. 80 | 81 | 82 |
83 | Even worse, an address violation could happen when reading a `struct sockaddr` as `struct sockaddr_in6`. 84 | ``` 85 | void printAddress(struct sockaddr* addr) 86 | { 87 | char buf[INET6_ADDRSTRLEN]; 88 | if (addr->sa_family == AF_INET) { 89 | // ... 90 | } else if (addr->sa_family == AF_INET6) { 91 | struct sockaddr_in6* addr6 = (struct sockaddr_in6*)addr; // *** 92 | if (inet_ntop(AF_INET6, &addr6->sin6_addr, buf, sizeof buf)) { 93 | printf("[%s]:%u\n", buf, ntohs(addr6->sin6_port)); 94 | } 95 | } 96 | } 97 | 98 | void printPeerNameWrong(int sockfd) 99 | { 100 | struct sockaddr peerAddr; 101 | socklen_t addrlen = sizeof peerAddr; 102 | if (getpeername(sockfd, &peerAddr, &addrlen) < 0) { 103 | perror("getpeername"); 104 | return; 105 | } 106 | // !!! should check addrlen <= sizeof peerAddr. 107 | printAddress(&peerAddr); 108 | } 109 | 110 | ``` 111 | 
112 | 113 | 114 | AddressSanitizer output: 115 | ``` 116 | ==3735==ERROR: AddressSanitizer: stack-buffer-overflow on address 0x7ffd8e5a3380 at pc 0x7f0e484164c2 bp 0x7ffd8e5a3240 sp 0x7ffd8e5a29f0 117 | READ of size 16 at 0x7ffd8e5a3380 thread T0 118 | #0 0x7f0e484164c1 in __interceptor_inet_ntop ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:2478 119 | #1 0x5595a07923d1 in printAddress (/home/schen/cpp/a.out+0x13d1) 120 | #2 0x5595a0792728 in printPeerNameWrong (/home/schen/cpp/a.out+0x1728) 121 | #3 0x5595a0792b1c in main (/home/schen/cpp/a.out+0x1b1c) 122 | #4 0x7f0e48237d09 in __libc_start_main ../csu/libc-start.c:308 123 | #5 0x5595a07921e9 in _start (/home/schen/cpp/a.out+0x11e9) 124 | 125 | Address 0x7ffd8e5a3380 is located in stack of thread T0 at offset 80 in frame 126 | #0 0x5595a0792645 in printPeerNameWrong (/home/schen/cpp/a.out+0x1645) 127 | 128 | This frame has 2 object(s): 129 | [48, 52) 'addrlen' (line 37) 130 | [64, 80) 'peerAddr' (line 36) <== Memory access at offset 80 overflows this variable 131 | SUMMARY: AddressSanitizer: stack-buffer-overflow ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:2478 in __interceptor_inet_ntop 132 | ``` 133 | 134 | This stack buffer overflow is not detected by Valgrind, instead, the program prints some 135 | truncated address like `2601:647:4500:32bd::` when it should print `2601:647:4500:32bd:8a51:xxff:fexx:xxxx`. 136 | 137 | In both cases above, `struct sockaddr_storage` should be used instead. 138 | 139 | A `reinterpret_cast` is usually needed in C++, or two `static_cast` 140 | from `struct sockaddr_in*` to `void*`, then to `struct sockaddr*`, like this: 141 | `static_cast<struct sockaddr*>(static_cast<void*>(&addr))`. 142 | New functions like `inet_pton` and `inet_ntop` use `void*` for socket addresses, 143 | saving us some casting. 144 | Also in C++, pass-by-reference (`struct sockaddr&`) vs. pass-by-value, 145 | the latter may be a victim of object-slicing problem. 
146 | 147 | ## Host environment 148 | 149 | 1. IPv4 only 150 | * `socket(AF_INET, ...)` succeeds 151 | * `socket(AF_INET6, ...)` fails 152 | 2. IPv6 only 153 | * `socket(AF_INET, ...)` fails 154 | * `socket(AF_INET6, ...)` succeeds 155 | 3. Dual-stack 156 | * `socket(AF_INET, ...)` succeeds 157 | * `socket(AF_INET6, ...)` succeeds 158 | 159 | ## Client 160 | 161 | Usually easy to write protocol-independent code. 162 | 163 | 1. parse IP address & port from text, select `sockaddr_in` or `sockaddr_in6` based on address format. 164 | 1. create socket fd for saddr.sa_family. 165 | 1. connect to saddr 166 | 1. read/write as usual. 167 | 168 | ## Server 169 | 170 | 1. IPv4 only, listen on 0.0.0.0:8000, serves IPv4 clients only. 171 | 1. IPv6 dual-stack, listen on [::]:8000, serves both IPv4 and IPv6 clients. IPv4 clients show up as IPv4-mapped address, like `::ffff:10.0.0.118`. 172 | 1. IPv6 only, listen on [::]:8000, turn on `IPV6_V6ONLY`, serves IPv6 clients only. 173 | 1. Two sockets, one listens on [::]:8000, turn on `IPV6_V6ONLY`, another listens on 0.0.0.0:8000. 174 | Serves IPv6 and IPv4 clients, respectively.`sshd` and `nginx` do this by default. 175 | 176 | System wide `IPV6_V6ONLY` on Linux: 177 | 178 | * sysctl `net.ipv6.bindv6only` 179 | * `/proc/sys/net/ipv6/bindv6only` 180 | 181 | Linux has it off by default, but see ref below for different opinion from OpenBSD. 182 | 183 | ## `read(2)` vs. `recv(2)` 184 | 185 | Table 12.1 Sending and receiving data on a socket, 186 | from page 598 of *The Design and Implementation of the FreeBSD Operating System, 2nd ed.*. 
187 | 188 | | syscall | Flags | Address | Scatter/Gather | Aux Data | 189 | | --- | --- | --- | --- | --- | 190 | | read/write | N | N | N | N | 191 | | readv/writev | N | N | Y | N | 192 | | recv/send | Y | N | N | N | 193 | | recvfrom/sendto | Y | Y | N | N | 194 | | recvmsg/sendmsg | Y | Y | Y | Y | 195 | 196 | On x86-64 Linux, `recv(2)` is not a syscall in fact, it's a wrapper of `recvfrom(..., NULL, NULL)`. 197 | Call graph as of Linux 5.15: 198 | 199 | ```dot 200 | digraph G { 201 | rankdir = TB; 202 | sys_read [label="__x64_sys_read\n__do_sys_read\nksys_read\nvfs_read\nnew_sync_read\n\nfs/read_write.c", shape=box] 203 | # read_iter [label="call_read_iter\nsock_read_iter"] 204 | "read()" -> sys_read -> "call_read_iter" -> "sock_read_iter" -> "sock_recvmsg" -> sock_recvmsg_nosec -> inet_recvmsg -> tcp_recvmsg 205 | inet_recvmsg[label="inet_recvmsg\nnet/ipv4/af_inet.c"] 206 | tcp_recvmsg[label="tcp_recvmsg\nnet/ipv4/tcp.c"] 207 | 208 | "call_read_iter" [label="call_read_iter\nlinux/fs.h"] 209 | sys_recvmsg [label="__x64_sys_recvmsg\n__se_sys_recvmsg\n__do_sys_recvmsg\n__sys_recvmsg\n\nnet/socket.c", shape=box] 210 | "recvmsg()" -> sys_recvmsg -> "sock_recvmsg" 211 | 212 | sys_readv [label="__x64_sys_readv\n__do_sys_readv\ndo_readv\nvfs_readv\ndo_iter_read\ndo_iter_readv_writev\n\nfs/read_write.c", shape=box] 213 | "readv()" -> sys_readv -> "call_read_iter" 214 | 215 | sys_recvfrom [label="__x64_sys_recvfrom\n__se_sys_recvfrom\n__do_sys_recvfrom\n__sys_recvfrom\n\nnet/socket.c", shape=box] 216 | "recvfrom()" -> sys_recvfrom -> "sock_recvmsg" 217 | 218 | 219 | "recv()" -> "__libc_recv\nglibc on x86-64" -> "recvfrom()" 220 | // "recv()" [style=filled] 221 | 222 | subgraph cluster_x { 223 | graph [style="dashed", label="net/socket.c"] 224 | sock_read_iter;sock_recvmsg;sock_recvmsg_nosec 225 | // sys_recvfrom;sys_recvmsg 226 | } 227 | } 228 | ``` 229 | 230 | ## References 231 | 232 | * [RFC3493](https://tools.ietf.org/html/rfc3493) Basic Socket Interface Extensions 
for IPv6, 2003/02 233 | * [lwn688462](https://lwn.net/Articles/688462/) Should distributors disable IPv4-mapped IPv6? 234 | * [UNP3e](http://unpbook.com/) _Unix Network Programming 3/e_, Chapter 12. 235 | 236 | -------------------------------------------------------------------------------- /docs/stevens.md: -------------------------------------------------------------------------------- 1 | # W. Richard Stevens 2 | 3 | [W. Richard Stevens](https://en.wikipedia.org/wiki/W._Richard_Stevens) (1951 ~ 1999) was an extraordinary tech writer and teacher. 4 | He's regarded as the [Guru of the Unix gurus](https://www.salon.com/2000/09/01/rich_stevens/). 5 | 6 | > "I really believe that my background is fundamental to the success of _Unix Network Programming_ and my other books," he said. 7 | > "That is, I was not one of the developers at Berkeley or AT&T, so the writing of UNP was not a 'memory dump.' 8 | > Everything that is in the book I had to dig out of somewhere and understand myself." 9 | 10 | ## Unix Network Programming 11 | 12 | Before he published _Unix Network Programming 1/e_ in 1990, there were very limited resources on how to program with the BSD Sockets API. 13 | Besides manpages, I could only find two short tutorials from BSD documents, ~20 pages each. 14 | 15 | * [An Introductory 4.3BSD Interprocess Communication Tutorial](http://www.bitsavers.org/pdf/mtXinu/MT_XINU_PS1_Apr_1986.pdf), Stuart Sechrest, 1986. 16 | * [An Advanced 4.3BSD Interprocess Communication Tutorial](http://www2.units.it/mumolo/reti_nettuno/bsdsocket.pdf), Samuel J. Leffler, et al., 1986. 17 | 18 | Quote from [Usenix Lifetime Achievement Award](https://www.usenix.org/about/awards/flame): 19 | 20 | > He made network programming accessible, even possible, to learn.
21 | 22 | ## TCP/IP Illustrated 23 | 24 | Quote from [Usenix Lifetime Achievement Award](https://www.usenix.org/about/awards/flame): 25 | 26 | > His three volume work, _TCP/IP Illustrated_, was so complete and true to reality 27 | > that it is used as a reference by members of the Internet Engineering Task Force, 28 | > the group charged with creating and maintaining the standards for the Internet. 29 | 30 | For example, _TCP/IP Illustrated, vol. 1: The Protocols_ was published in Jan 1994. 31 | It was often cited as the reference for TCP in many papers and RFCs published after that. 32 | 33 | * [TCP Vegas: End to End Congestion Avoidance on a Global Internet](https://sites.cs.ucsb.edu/~almeroth/classes/F05.276/papers/vegas.pdf), [Lawrence Brakmo](http://www.brakmo.org/lawrence/papers.html) and [Larry Peterson](https://www.cs.princeton.edu/~llp/), JSAC 1995. 34 | * [Performance Problems in BSD4.4 TCP](http://www.brakmo.org/lawrence/brakmo95bsdtcp.pdf), L. Brakmo and L. Peterson, 1995. 35 | * [RFC1948](https://datatracker.ietf.org/doc/html/rfc1948): Defending Against Sequence Number Attacks, 1996. 36 | * [Simulation-based Comparisons of Tahoe, Reno, and SACK TCP](https://ee.lbl.gov/papers/sacks.pdf), [Kevin Fall](http://www.kfall.com/) and [Sally Floyd](https://en.wikipedia.org/wiki/Sally_Floyd), 1996. 37 | * [RFC2018](https://datatracker.ietf.org/doc/html/rfc2018): TCP Selective Acknowledgment Options, 1996. 38 | * [Automated Packet Trace Analysis of TCP Implementations](http://conferences.sigcomm.org/sigcomm/1997/papers/p054.pdf), Vern Paxson, 1997. 39 | * [The Macroscopic Behavior of the TCP Congestion Avoidance Algorithm](https://www.cs.utexas.edu/users/lam/395t/papers/Mathis1998.pdf), [Matthew Mathis](https://dblp.org/pid/00/3534.html), et al., 1998. 40 | * [TCP Congestion Control with a Misbehaving Receiver](https://cseweb.ucsd.edu/~savage/papers/CCR99.pdf), [Stefan Savage](https://cseweb.ucsd.edu/~savage/), et al., 1999.
41 | * [RFC2525](https://datatracker.ietf.org/doc/html/rfc2525): Known TCP Implementation Problems, 1999. 42 | 43 | ## APUE 44 | 45 | Quote from Dennis Ritchie's Foreword to the Second Edition: 46 | 47 | > In fact, I would claim that a central reason for the [Unix] system’s longevity has been that 48 | > it has attracted remarkably talented writers to explain its beauties and mysteries. 49 | > Brian Kernighan is one of these; Rich Stevens is certainly another. 50 | > The first edition of this book, along with his series of books about networking, 51 | > are rightfully regarded as remarkably well-crafted works of exposition, and became hugely popular. 52 | 53 | 54 | ## RFCs 55 | 56 | Besides the classic books he wrote, W. Stevens authored and co-authored two series of RFCs. 57 | 58 | * TCP Congestion Control 59 | * RFC2001: TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery Algorithms, W. Stevens, 1997 60 | * First RFC to document TCP congestion control, much of this memo is taken from TCPv1 and TCPv2 books. 61 | * RFC2581: TCP Congestion Control, M. Allman, V. Paxson, W. Stevens, 1999. 62 | 63 | In RFC5681, the current standard of TCP congestion control, in Acknowledgments: 64 | 65 | > W. Richard ("Rich") Stevens wrote the first version of this document 66 | > [RFC2001] and co-authored the second version [RFC2581]. This present 67 | > version much benefits from his clarity and thoughtfulness of 68 | > description, and we are grateful for Rich's contributions in 69 | > elucidating TCP congestion control, as well as in more broadly 70 | > helping us understand numerous issues relating to networking. 71 | 72 | 73 | * IPv6 Sockets API 74 | * RFC 2133: Basic Socket Interface Extensions for IPv6, Gilligan, R., Thomson, S., Bound, J., and W. Stevens, 1997. 75 | * RFC 2553: 1999 76 | * RFC 3493: 2003 77 | * RFC 2292: Advanced Sockets API for IPv6, W. Stevens and M. Thomas, 1998.
78 | * RFC 3542: 2003 79 | 80 | In other words, Rich Stevens not only documented the Sockets API, he helped design it. 81 | -------------------------------------------------------------------------------- /docs/throughput.md: -------------------------------------------------------------------------------- 1 | # TCP Throughput 2 | 3 | TCP Throughput <= Bytes in flight / RTT, where RTT = round-trip time. 4 | Max bytes in flight = min(Cwnd, Rwnd, sndbuf, BDP). 5 | 6 | ## TCP trace segment graph 7 | 8 | `tcptrace` is a tool written by [Shawn Ostermann](http://oucsace.cs.ohio.edu/~osterman/) at Ohio University, http://tcptrace.org. Wireshark can produce nice interactive graphs. 9 | 10 | ![FreeBSD](img/freebsd-newreno.png) 11 | 12 | Here we show FreeBSD's NewReno congestion control, slow start after packet loss. 13 | 14 | 15 | ## Mininet 16 | 17 | All graphs below are from Linux using CUBIC cc, running under [Mininet](http://mininet.org). 18 | 19 | ``` 20 | mininet> net 21 | h1 h1-eth0:s1-eth1 22 | h2 h2-eth0:s1-eth2 23 | s1 lo: s1-eth1:h1-eth0 s1-eth2:h2-eth0 24 | ``` 25 | 26 | ![mininet](img/mininet.png) 27 | 28 | Packets are captured at sender side `tcpdump -i s1-eth1 -s 128`. 29 | 30 | ``` 31 | mininet> h1 ping -c 4 h2 32 | PING 10.0.0.2 (10.0.0.2) 56(84) bytes of data. 
33 | 64 bytes from 10.0.0.2: icmp_seq=1 ttl=64 time=0.252 ms 34 | 64 bytes from 10.0.0.2: icmp_seq=2 ttl=64 time=0.053 ms 35 | 64 bytes from 10.0.0.2: icmp_seq=3 ttl=64 time=0.059 ms 36 | 64 bytes from 10.0.0.2: icmp_seq=4 ttl=64 time=0.056 ms 37 | ``` 38 | 39 | ``` 40 | mininet> xterm h2 # then start `iperf3 -s` on h2 41 | 42 | mininet> h1 iperf3 -c h2 43 | Connecting to host 10.0.0.2, port 5201 44 | [ 5] local 10.0.0.1 port 37680 connected to 10.0.0.2 port 5201 45 | [ ID] Interval Transfer Bitrate Retr Cwnd 46 | [ 5] 0.00-1.00 sec 8.00 GBytes 68.7 Gbits/sec 0 666 KBytes 47 | [ 5] 1.00-2.00 sec 8.20 GBytes 70.4 Gbits/sec 0 940 KBytes 48 | [ 5] 2.00-3.00 sec 7.89 GBytes 67.7 Gbits/sec 0 1.15 MBytes 49 | [ 5] 3.00-4.00 sec 7.90 GBytes 67.9 Gbits/sec 0 1.28 MBytes 50 | [ 5] 4.00-5.00 sec 7.98 GBytes 68.6 Gbits/sec 0 1.42 MBytes 51 | [ 5] 5.00-6.00 sec 8.16 GBytes 70.1 Gbits/sec 0 1.70 MBytes 52 | [ 5] 6.00-7.00 sec 8.17 GBytes 70.2 Gbits/sec 0 1.70 MBytes 53 | [ 5] 7.00-8.00 sec 8.14 GBytes 69.9 Gbits/sec 0 1.79 MBytes 54 | [ 5] 8.00-9.00 sec 8.04 GBytes 69.0 Gbits/sec 0 1.79 MBytes 55 | [ 5] 9.00-10.00 sec 7.99 GBytes 68.7 Gbits/sec 0 1.88 MBytes 56 | - - - - - - - - - - - - - - - - - - - - - - - - - 57 | [ ID] Interval Transfer Bitrate Retr 58 | [ 5] 0.00-10.00 sec 80.5 GBytes 69.1 Gbits/sec 0 sender 59 | [ 5] 0.00-10.00 sec 80.5 GBytes 69.1 Gbits/sec receiver 60 | 61 | iperf Done. 62 | ``` 63 | 64 | Add 100ms latency using `netem delay 100ms`. 65 | 66 | ``` 67 | mininet> s1 tc qdisc replace dev s1-eth2 root netem delay 100ms 68 | 69 | mininet> h1 ping -c 4 h2 70 | PING 10.0.0.2 (10.0.0.2) 56(84) bytes of data. 
71 | 64 bytes from 10.0.0.2: icmp_seq=1 ttl=64 time=101 ms 72 | 64 bytes from 10.0.0.2: icmp_seq=2 ttl=64 time=100 ms 73 | 64 bytes from 10.0.0.2: icmp_seq=3 ttl=64 time=100 ms 74 | 64 bytes from 10.0.0.2: icmp_seq=4 ttl=64 time=100 ms 75 | ``` 76 | 77 | ## Slow sender 78 | 79 | `iperf3 -c server --bitrate 10M` 80 | 81 | ![sender](img/client-limit.png) 82 | 83 | ## Slow sender using FQ pacing 84 | 85 | `iperf3 -c server --fq-rate 10M` 86 | 87 | 88 | ![sender-fq](img/client-limit-fq.png) 89 | 90 | No bursts. 91 | 92 | ## Slow receiver 93 | 94 | `iperf3 -s --server-bitrate-limit 10M` 95 | 96 | ![receiver](img/server-limit.png) 97 | 98 | Small window size. 99 | 100 | ## Small Rwnd 101 | 102 | ``` 103 | mininet> h1 bin/tcpperf -c h2 104 | Connected 10.0.0.1:37662 -> 10.0.0.2:2009, congestion control: cubic 105 | Time (s) Throughput Bitrate Cwnd Rwnd sndbuf ssthresh rtt/var 106 | 0.000s 0.00kB/s 0.00kbps 14.1Ki 42.4Ki 85.3Ki 2048Mi 201.2ms/100593 107 | 1.048s 21.0MB/s 168Mbps 10.5Mi 6094Ki 16.0Mi 2048Mi 100.2ms/56 retrans=3 108 | 2.050s 65.2MB/s 522Mbps 16.5Mi 6518Ki 16.0Mi 2048Mi 100.4ms/107 109 | 3.054s 66.6MB/s 533Mbps 16.5Mi 6546Ki 16.0Mi 2048Mi 100.4ms/80 110 | 4.058s 66.7MB/s 534Mbps 16.5Mi 6520Ki 16.0Mi 2048Mi 100.5ms/61 111 | 5.063s 66.7MB/s 533Mbps 16.5Mi 6546Ki 16.0Mi 2048Mi 100.5ms/80 112 | 6.066s 66.8MB/s 534Mbps 16.5Mi 6520Ki 16.0Mi 2048Mi 100.4ms/81 113 | 7.070s 66.6MB/s 533Mbps 16.5Mi 6546Ki 16.0Mi 2048Mi 100.4ms/69 114 | 8.074s 66.8MB/s 534Mbps 16.5Mi 6520Ki 16.0Mi 2048Mi 100.4ms/68 115 | 9.077s 66.9MB/s 535Mbps 16.5Mi 6552Ki 16.0Mi 2048Mi 100.3ms/77 116 | 10.081s 66.7MB/s 534Mbps 16.5Mi 6548Ki 16.0Mi 2048Mi 100.4ms/115 117 | Transferred 623MBytes in 10.238s, 4754 syscalls, 131072.0 Bytes/syscall 118 | ``` 119 | 120 | Throughput is limited by Rwnd (`snd_wnd`), 100ms * 66.7MB/s = 6.5MB. 121 | 122 | ![recv](img/sndwnd-limit.png) 123 | 124 | Window is filled up as soon as advertised. 125 | 126 | Set larger `tcp_rmem` on receiver, for larger Rwnd. 
127 | 128 | ``` 129 | mininet> h2 sysctl -A |grep tcp_.mem 130 | net.ipv4.tcp_rmem = 10240 87380 16777216 131 | net.ipv4.tcp_wmem = 10240 87380 16777216 132 | mininet> h2 sysctl -A |grep tcp_adv 133 | net.ipv4.tcp_adv_win_scale = 1 134 | mininet> h2 sysctl -w net.ipv4.tcp_rmem="10240 131072 65536000" 135 | net.ipv4.tcp_rmem = 10240 131072 65536000 136 | ``` 137 | 138 | For `net.ipv4.tcp_adv_win_scale = 1`, `Rwnd = tcp_rmem[2] / 2 = 32MB`. 139 | 140 | 141 | ## Small sndbuf 142 | 143 | ``` 144 | mininet> h1 bin/tcpperf -c h2 145 | Connected 10.0.0.1:45092 -> 10.0.0.2:2009, congestion control: cubic 146 | Time (s) Throughput Bitrate Cwnd Rwnd sndbuf ssthresh rtt/var 147 | 0.000s 0.00kB/s 0.00kbps 14.1Ki 42.4Ki 85.3Ki 2048Mi 100.7ms/50326 148 | 1.045s 18.3MB/s 147Mbps 10.2Mi 8474Ki 16.0Mi 2048Mi 103.6ms/6137 retrans=2 149 | 2.034s 148MB/s 1184Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/20 150 | 3.014s 159MB/s 1272Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/1 151 | 4.005s 157MB/s 1255Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/5 152 | 5.042s 158MB/s 1264Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/16 153 | 6.020s 159MB/s 1275Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/15 154 | 7.016s 154MB/s 1229Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/2 155 | 8.055s 158MB/s 1261Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/0 156 | 9.019s 164MB/s 1314Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/15 157 | 10.034s 151MB/s 1206Mbps 31.4Mi 31.2Mi 16.0Mi 2048Mi 100.0ms/14 158 | Transferred 1425MBytes in 10.134s, 10869 syscalls, 131072.0 Bytes/syscall 159 | ``` 160 | 161 | Throughput is limited by sndbuf, 100ms * 160MB/s = 16MB. 162 | 163 | 164 | ![sndbuf](img/sndbuf-limit.png) 165 | 166 | Rwnd is only half filled, in a burst. 167 | 168 | Higher throughput is achieved with a larger `sndbuf`.
169 | 170 | ``` 171 | mininet> h1 sysctl -w net.ipv4.tcp_wmem="10240 131072 65536000" 172 | net.ipv4.tcp_wmem = 10240 131072 65536000 173 | 174 | mininet> h1 bin/tcpperf -c h2 175 | Connected 10.0.0.1:53176 -> 10.0.0.2:2009, congestion control: cubic 176 | Time (s) Throughput Bitrate Cwnd Rwnd sndbuf ssthresh rtt/var 177 | 0.000s 0.00kB/s 0.00kbps 14.1Ki 42.4Ki 128Ki 2048Mi 100.4ms/50205 178 | 1.004s 32.1MB/s 257Mbps 7767Ki 11.0Mi 37.8Mi 2048Mi 100.5ms/47 179 | 2.051s 249MB/s 1995Mbps 68.3Mi 24.9Mi 62.5Mi 2048Mi 100.7ms/134 retrans=585 180 | 3.056s 260MB/s 2081Mbps 68.3Mi 24.9Mi 62.5Mi 2048Mi 100.6ms/147 181 | 4.061s 260MB/s 2082Mbps 68.3Mi 21.9Mi 62.5Mi 2048Mi 100.5ms/23 182 | 5.024s 274MB/s 2194Mbps 68.3Mi 24.3Mi 62.5Mi 2048Mi 100.2ms/64 183 | 6.028s 313MB/s 2500Mbps 68.3Mi 26.4Mi 62.5Mi 2048Mi 100.1ms/58 184 | 7.034s 325MB/s 2603Mbps 68.3Mi 29.6Mi 62.5Mi 2048Mi 100.1ms/73 185 | 8.040s 325MB/s 2602Mbps 68.3Mi 28.9Mi 62.5Mi 2048Mi 100.1ms/40 186 | 9.046s 326MB/s 2604Mbps 68.3Mi 29.7Mi 62.5Mi 2048Mi 100.2ms/112 187 | 10.051s 325MB/s 2603Mbps 68.3Mi 27.9Mi 62.5Mi 2048Mi 100.1ms/100 188 | Transferred 2703MBytes in 10.154s, 20625 syscalls, 131072.0 Bytes/syscall 189 | ``` 190 | 191 | ## Small Cwnd 192 | 193 | Congestion control algorithms decide congestion window (Cwnd). 194 | 195 | With RTT = 100ms, FreeBSD newreno CC sometimes increases Cwnd slowly. 196 | 197 | ![](img/freebsd-slow-start.png) 198 | 199 | In the following example, reaches max bandwidth after ~30 seconds. 
200 | 201 | ``` 202 | freebsd:~/recipes/tpc % bin/tcpperf -c 172.16.0.59 -b 100G -t 30 203 | Connected 172.16.0.77:31839 -> 172.16.0.59:2009, congestion control: newreno 204 | Time (s) Throughput Bitrate Cwnd Rwnd sndbuf ssthresh rtt/var 205 | 0.000s 0.00kB/s 0.00kbps 14.1Ki 63.6Ki 32.8Ki 1024Mi 202.0ms/101000 206 | 1.104s 356kB/s 2849kbps 45.5Ki 435Ki 96.8Ki 1024Mi 123.8ms/42562 207 | 2.007s 580kB/s 4643kbps 71.2Ki 589Ki 137Ki 1024Mi 107.5ms/13937 208 | 3.010s 915kB/s 7318kbps 99.7Ki 593Ki 201Ki 1024Mi 102.3ms/3750 209 | 4.014s 1176kB/s 9404kbps 128Ki 593Ki 257Ki 1024Mi 100.9ms/1187 210 | 5.017s 1437kB/s 11.5Mbps 162Ki 1307Ki 321Ki 1024Mi 100.5ms/687 211 | 6.021s 1829kB/s 14.6Mbps 191Ki 1307Ki 377Ki 1024Mi 100.4ms/562 212 | 7.024s 2090kB/s 16.7Mbps 219Ki 1287Ki 441Ki 1024Mi 100.4ms/687 213 | 8.028s 2481kB/s 19.9Mbps 248Ki 1266Ki 497Ki 1024Mi 100.5ms/687 214 | 9.032s 2611kB/s 20.9Mbps 276Ki 1246Ki 553Ki 1024Mi 100.5ms/750 215 | 10.035s 3005kB/s 24.0Mbps 311Ki 1820Ki 617Ki 1024Mi 100.4ms/500 216 | 11.038s 3396kB/s 27.2Mbps 339Ki 1894Ki 681Ki 1024Mi 100.4ms/500 217 | 12.042s 3526kB/s 28.2Mbps 368Ki 1919Ki 737Ki 1024Mi 100.5ms/625 218 | 13.047s 4045kB/s 32.4Mbps 418Ki 1852Ki 857Ki 1024Mi 101.2ms/1375 219 | 14.050s 4701kB/s 37.6Mbps 480Ki 1868Ki 1009Ki 1024Mi 100.4ms/562 220 | 15.054s 5354kB/s 42.8Mbps 553Ki 2361Ki 1169Ki 1024Mi 101.3ms/2000 221 | 16.039s 6255kB/s 50.0Mbps 641Ki 2681Ki 1377Ki 1024Mi 100.6ms/687 222 | 17.002s 7212kB/s 57.7Mbps 727Ki 2696Ki 1561Ki 1024Mi 102.2ms/4062 223 | 18.006s 8098kB/s 64.8Mbps 813Ki 2684Ki 1769Ki 1024Mi 100.7ms/687 224 | 19.010s 8617kB/s 68.9Mbps 898Ki 2663Ki 1817Ki 1024Mi 100.5ms/625 225 | 20.013s 9662kB/s 77.3Mbps 1004Ki 2914Ki 1817Ki 1024Mi 100.4ms/500 226 | 21.017s 10.7MB/s 85.7Mbps 1118Ki 2893Ki 1817Ki 1024Mi 100.4ms/500 227 | 22.021s 11.9MB/s 95.0Mbps 1235Ki 3010Ki 1817Ki 1024Mi 100.4ms/500 228 | 23.025s 13.2MB/s 106Mbps 1369Ki 2989Ki 1817Ki 1024Mi 100.3ms/437 229 | 24.029s 14.5MB/s 116Mbps 1497Ki 2916Ki 1817Ki 1024Mi 
100.5ms/687 230 | 25.033s 15.9MB/s 127Mbps 1634Ki 2930Ki 1817Ki 1024Mi 100.3ms/437 231 | 26.037s 17.2MB/s 138Mbps 1796Ki 3030Ki 1817Ki 1024Mi 100.4ms/562 232 | 27.001s 18.2MB/s 146Mbps 2010Ki 3066Ki 1817Ki 1024Mi 100.8ms/687 233 | 28.005s 18.5MB/s 148Mbps 2191Ki 3026Ki 1817Ki 1024Mi 100.7ms/687 234 | 29.008s 18.4MB/s 147Mbps 2419Ki 2986Ki 1817Ki 1024Mi 100.4ms/562 235 | 30.012s 18.5MB/s 148Mbps 2619Ki 2992Ki 1817Ki 1024Mi 100.7ms/625 236 | Transferred 234MBytes in 30.113s, 1787 syscalls, 131072.0 Bytes/syscall 237 | ``` 238 | 239 | Before Cwnd reaches sndbuf (1.8MB), throughput is dominated by Cwnd. 240 | e.g. at 20-th second: Cwnd = 1000K, throughput = 1000K / 0.1s = 10MB/s. 241 | 242 | After that, throughput is dominated by sndbuf in this case. 243 | e.g. at 30-th second, sndbuf = 1.8MB, throughput = 18MB/s. 244 | 245 | See discussion on freebsd-net mailing list 2023-05: 246 | 247 | and [Slow Start](slowstart.md#freebsd) 248 | 249 | ## Bandwidth limit 250 | 251 | ``` 252 | mininet> s1 tc qdisc replace dev s1-eth2 root netem delay 10ms rate 10Mbit 253 | 254 | mininet> h1 bin/tcpperf -c h2 255 | Connected 10.0.0.1:48982 -> 10.0.0.2:2009, congestion control: cubic 256 | Time (s) Throughput Bitrate Cwnd Rwnd sndbuf ssthresh rtt/var 257 | 0.000s 0.00kB/s 0.00kbps 14.1Ki 42.4Ki 128Ki 2048Mi 10.7ms/5341 258 | 1.120s 1638kB/s 13.1Mbps 132Ki 323Ki 790Ki 70.7Ki 106.2ms/574 259 | 2.094s 1481kB/s 11.8Mbps 189Ki 416Ki 1139Ki 70.7Ki 152.8ms/1331 260 | 3.018s 1418kB/s 11.3Mbps 243Ki 535Ki 1462Ki 70.7Ki 196.7ms/1030 261 | 4.226s 1411kB/s 11.3Mbps 314Ki 680Ki 1887Ki 70.7Ki 254.5ms/1929 262 | 5.196s 1351kB/s 10.8Mbps 370Ki 795Ki 2227Ki 70.7Ki 300.2ms/813 263 | 6.357s 1354kB/s 10.8Mbps 438Ki 955Ki 2635Ki 70.7Ki 355.6ms/856 264 | 7.009s 1408kB/s 11.3Mbps 475Ki 1021Ki 2856Ki 70.7Ki 386.2ms/374 265 | 8.444s 1461kB/s 11.7Mbps 560Ki 1213Ki 3366Ki 70.7Ki 455.5ms/1334 266 | 9.281s 1409kB/s 11.3Mbps 608Ki 1326Ki 3655Ki 70.7Ki 494.9ms/801 267 | 10.160s 1342kB/s 10.7Mbps 660Ki 1399Ki 3970Ki
70.7Ki 537.0ms/1078 268 | Transferred 14.5MBytes in 12.180s, 111 syscalls, 131072.0 Bytes/syscall 269 | ``` 270 | 271 | ![bandwidth](img/bandwidth-limit.png) 272 | 273 | This is probably the best case, as all available bandwidth is utilized. 274 | 275 | ## Performance 276 | 277 | According to [Understanding Host Network Stack Overheads](https://www.cs.cornell.edu/~ragarwal/pubs/network-stack.pdf) SIGCOMM'21, 278 | _modern Linux network stack can achieve ~42Gbps throughput-per-core_. In other words, a single TCP connection can sustain a 40Gbps NIC unidirectionally, but not a 100Gbps NIC. 279 | 280 | Eric reported [170Gbps single flow on 200Gbps NIC](https://netdevconf.info/0x16/session.html?State-of-the-union-in-TCP-land) in netdev conf 2022-10, with receiver zero copy and [BIG TCP](https://netdevconf.info/0x15/session.html?BIG-TCP). 281 | 282 | -------------------------------------------------------------------------------- /docs/walkthrough.md: -------------------------------------------------------------------------------- 1 | # Code Walkthrough 2 | 3 | ![Sockets](img/sockets.png) 4 | 5 | ## Blocking write 6 | 7 | `packetdrill/gtests/net/tcp/blocking/blocking-write.pkt` 8 | 9 | ```c 10 | // Test for blocking write. 11 | --tolerance_usecs=10000 12 | 13 | `../common/defaults.sh 14 | ../common/set_sysctls.py /proc/sys/net/ipv4/tcp_min_tso_segs=10 15 | ` 16 | 17 | // Establish a connection. 18 | 0 socket(..., SOCK_STREAM,PROTO_TCP) = 3 19 | +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 20 | +0 bind(3, ..., ...) = 0 21 | +0 listen(3, 1) = 0 22 | 23 | +.1 < S 0:0(0) win 50000 24 | +0 > S. 0:0(0) ack 1 25 | +.1 < . 1:1(0) ack 1 win 50000 26 | +0 accept(3, ..., ...) = 4 27 | 28 | // Kernel doubles our value -> sk->sk_sndbuf is set to 42000 29 | +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [21000], 4) = 0 30 | +0 getsockopt(4, SOL_SOCKET, SO_SNDBUF, [42000], [4]) = 0 31 | 32 | // A write of 60000 does not block.
33 | +0...0.300 write(4, ..., 61000) = 61000 // this write() blocks 34 | 35 | +.1 < . 1:1(0) ack 10001 win 50000 36 | 37 | +.1 < . 1:1(0) ack 30001 win 50000 38 | 39 | // This ACK should wakeup the write(). An ACK of 35001 does not. 40 | +.1 < . 1:1(0) ack 36001 win 50000 41 | 42 | // Reset to sysctls defaults. 43 | `/tmp/sysctl_restore_${PPID}.sh` 44 | ``` 45 | 46 | `tcpdump -i any -n -ttt tcp port 8080` 47 | ```c 48 | // Three-way handshake 49 | 1 0.000000 remote:54321 > local:8080: [S], seq 0, win 50000, options [mss 1000,nop,wscale 0], length 0 50 | 2 0.000640 local:8080 > remote:54321: [S.], seq 12345, ack 1, win 65535, options [mss 1460,nop,wscale 8], length 0 51 | 3 0.111259 remote:54321 > local:8080: [.], ack 1, win 50000, length 0 52 | 53 | // cwnd = 10, mss = 1000, so send 10 * 1000 then wait for ACK. 54 | 4 0.017588 local:8080 > remote:54321: [P.], seq 1:5001, ack 1, win 256, length 5000 55 | 5 0.000199 local:8080 > remote:54321: [P.], seq 5001:10001, ack 1, win 256, length 5000 56 | 6 0.101236 remote:54321 > local:8080: [.], ack 10001, win 50000, length 0 57 | 58 | // slow-start, increase cwnd per ACK. cwnd = 20, so send 20 * 1000 then wait for ACK. 59 | 7 0.000573 local:8080 > remote:54321: [P.], seq 10001:20001, ack 1, win 256, length 10000 60 | 8 0.000276 local:8080 > remote:54321: [P.], seq 20001:30001, ack 1, win 256, length 10000 61 | 9 0.099876 remote:54321 > local:8080: [.], ack 30001, win 50000, length 0 62 | 63 | // slow-start, again. write() now blocks. 
64 | 10 0.000490 local:8080 > remote:54321: [P.], seq 30001:35001, ack 1, win 256, length 5000 65 | 11 0.000456 local:8080 > remote:54321: [P.], seq 35001:45001, ack 1, win 256, length 10000 66 | 12 0.000182 local:8080 > remote:54321: [P.], seq 45001:55001, ack 1, win 256, length 10000 67 | 13 0.000157 local:8080 > remote:54321: [P.], seq 55001:60001, ack 1, win 256, length 5000 68 | 14 0.098661 remote:54321 > local:8080: [.], ack 36001, win 50000, length 0 69 | 70 | // the previous ACK unblocks write(). 71 | 15 0.001139 local:8080 > remote:54321: [P.], seq 60001:61001, ack 1, win 256, length 1000 72 | 16 0.325737 local:8080 > remote:54321: [.], seq 36001:37001, ack 1, win 256, length 1000 // Re-xmit 73 | 17 0.038498 local:8080 > remote:54321: [F.], seq 61001, ack 1, win 256, length 0 74 | ``` 75 | 76 | ```text 77 | tcp_sendmsg(size=61k), cwnd=10 78 | size_goal = 25k 79 | 1. copy 25k to skb 80 | sk_wmem_queued = 1280 + 25000 = 26280 81 | tcp_push_one 82 | tcp_write_xmit 83 | skb1 = 25k, pfrag = 25000/32768 84 | cwnd_quota = 5 85 | tso_fragment splits skb to 5k + 20k, sk_wmem_queued += 1280, skb1 = 5k, skb2 = 20k 86 | tcp_transmit_skb(5k) 87 | sk->tcp_rtx_queue.insert(skb1) 88 | 2. copy 5k, copy = size_goal - tcp_write_queue_tail(sk)->len = 25k - 20k = 5k 89 | skb2 += 5k, pfrag = 30000/32768 90 | sk_wmem_queued = 27560 + 5000 = 32560 91 | __tcp_push_pending_frames 92 | tcp_write_xmit 93 | cwnd_quota = 5 94 | tso_fragment splits skb to 5k + 20k, sk_wmem_queued += 1280, skb2 = 5k, skb3 = 20k 95 | tcp_transmit_skb(5k) 96 | sk->tcp_rtx_queue.insert(skb2) 97 | 3. copy 2768 (copied=32768) 98 | sk_wmem_queued = 33840 + 2768 = 36608 99 | sk->sk_write_queue = [skb3(len=22768)] 100 | 4. copy 2232 (copied=35000), size_goal - skb->len = 2232 101 | alloc a new page frag, WHY no new skb? 102 | sk_wmem_queued = 36608 + 2232 = 38840 103 | tcp_push_one 104 | cwnd_quota=0 105 | 5. 
copy 25k 106 | alloc a new skb, sk_wmem_queued += 1280 107 | sk_wmem_queued = 40120 + 25000 = 65120 108 | tcp_push_one 109 | cwnd_quota=0 110 | (copied=60k) 111 | 6. trying to copy remaining 1k 112 | sk->sk_wmem_queued (65120) > sk->sk_sndbuf (42000) 113 | wait_for_space 114 | 115 | 1st ack 10000, cwnd=20 116 | tcp_rcv_established 117 | tcp_ack 118 | tcp_clean_rtx_queue 119 | tcp_rtx_queue_unlink_and_free 120 | sk_wmem_free_skb 121 | tcp_data_snd_check 122 | tcp_push_pending_frames 123 | tcp_write_xmit 124 | tcp_transmit_skb(10k) 125 | tcp_transmit_skb(10k) 126 | sk->sk_wmem_queued: 65120 -> 55120 127 | tcp_check_space 128 | tcp_new_space 129 | sk_stream_write_space, stream_wspace=-13120 stream_min_wspace=27560) 130 | 131 | 2nd ack 30000, cwnd=40 132 | tcp_write_xmit 133 | tcp_transmit_skb(5k) 134 | tcp_transmit_skb(10k) 135 | tcp_transmit_skb(10k) 136 | tcp_transmit_skb(5k) 137 | sk->sk_wmem_queued: 55120 -> 35120 138 | sk_stream_write_space, stream_wspace=6880 stream_min_wspace=17560) 139 | 140 | 3rd ack 36000 141 | sk->sk_wmem_queued: 35120 -> 27840 142 | sk_stream_write_space, stream_wspace=14160 stream_min_wspace=13920 143 | __sk_stream_is_writeable=true, wake up 144 | tcp_sendmsg wakes up 145 | tcp_transmit_skb(1k) 146 | ``` 147 | 148 | Call trace of SYN, SYNACK 149 | ```text 150 | __do_softirq -> net_rx_action -> napi_poll -> virtnet_poll -> virtqueue_napi_complete 151 | -> napi_complete_done -> gro_normal_list -> netif_receive_skb_list_internal 152 | -> __netif_receive_skb_list -> __netif_receive_skb_list_core -> __netif_receive_skb_list_ptype 153 | -> ip_list_rcv -> ip_sublist_rcv -> ip_list_rcv_finish -> ip_sublist_rcv_finish 154 | -> dst_input -> ip_local_deliver -> ip_local_deliver_finish -> 155 | 156 | IPv4 157 | ip_local_deliver_finish -> ip_protocol_deliver_rcu -> tcp_v4_rcv -> tcp_v4_do_rcv 158 | -> tcp_rcv_state_process -> tcp_v4_conn_request -> tcp_conn_request -> tcp_v4_send_synack 159 | -> ip_output 160 | 161 | __do_softirq -> net_rx_action -> 
napi_poll -> process_backlog 162 | -> __netif_receive_skb -> __netif_receive_skb_one_core 163 | -> ip6_input -> 164 | 165 | IPv6 166 | ip6_input -> ip6_input_finish -> ip6_protocol_deliver_rcu -> tcp_v6_rcv -> tcp_v6_do_rcv 167 | -> tcp_rcv_state_process -> tcp_v6_conn_request -> tcp_conn_request -> tcp_v6_send_synack 168 | -> ip6_xmit -> dst_output -> ip6_output 169 | ``` 170 | 171 | 172 | Call trace of `read(2)` 173 | ```text 174 | entry_SYSCALL_64 -> do_syscall_64 -> ksys_read -> vfs_read -> new_sync_read -> call_read_iter 175 | -> sock_read_iter -> sock_recvmsg -> sock_recvmsg_nosec -> inet_recvmsg -> tcp_recvmsg 176 | ``` 177 | 178 | Call trace of `readv(2)` 179 | ```text 180 | entry_SYSCALL_64 -> do_syscall_x64 -> __x64_sys_readv -> __do_sys_readv -> do_readv -> vfs_readv 181 | -> do_iter_read -> do_iter_readv_writev -> call_read_iter 182 | -> sock_read_iter -> sock_recvmsg -> sock_recvmsg_nosec -> inet_recvmsg -> tcp_recvmsg 183 | ``` 184 | 185 | Call trace of `recvfrom` 186 | ```text 187 | entry_SYSCALL_64 -> do_syscall_64 -> __x64_sys_recvfrom -> __se_sys_recvfrom -> __do_sys_recvfrom 188 | -> __sys_recvfrom -> sock_recvmsg 189 | ``` 190 | 191 | Call trace of `recvmsg` 192 | ```text 193 | entry_SYSCALL_64 -> do_syscall_64 -> __sys_recvmsg -> ___sys_recvmsg -> ____sys_recvmsg 194 | -> sock_recvmsg 195 | ``` 196 | 197 | Call trace of `sendto` 198 | ```text 199 | entry_SYSCALL_64 -> do_syscall_64 -> __x64_sys_sendto -> __se_sys_sendto -> __do_sys_sendto 200 | -> __sys_sendto -> sock_sendmsg 201 | ``` 202 | 203 | Call trace of `sendmsg` 204 | ```text 205 | entry_SYSCALL_64 -> do_syscall_64 -> __sys_sendmsg -> ___sys_sendmsg -> ____sys_sendmsg 206 | -> sock_sendmsg 207 | ``` 208 | 209 | Call trace of `write(2)` 210 | ```text 211 | entry_SYSCALL_64 -> do_syscall_64 -> ksys_write -> vfs_write -> new_sync_write -> call_write_iter 212 | -> sock_write_iter -> sock_sendmsg -> sock_sendmsg_nosec -> tcp_sendmsg -> tcp_sendmsg_locked 213 | -> tcp_push -> 
__tcp_push_pending_frames -> tcp_write_xmit -> tcp_transmit_skb 214 | -> __tcp_transmit_skb -> ip_queue_xmit -> __ip_queue_xmit -> ip_local_out -> dst_output 215 | -> ip_output -> ip_finish_output -> __ip_finish_output -> ip_finish_output2 -> neigh_output 216 | 217 | IPv6 218 | __tcp_transmit_skb -> inet6_csk_xmit -> ip6_xmit -> dst_output -> ip6_output 219 | -> ip6_finish_output2 -> neigh_output -> neigh_hh_output -> dev_queue_xmit 220 | -> __dev_queue_xmit -> __dev_xmit_skb -> qdisc_run 221 | -> __qdisc_run -> qdisc_restart -> sch_direct_xmit -> dev_hard_start_xmit 222 | -> xmit_one -> netdev_start_xmit -> __netdev_start_xmit -> mlx4_en_* 223 | ``` 224 | 225 | Call trace of receiving ACK from packet 6 226 | ```text 227 | ip_local_deliver_finish -> ip_protocol_deliver_rcu -> tcp_v4_rcv -> tcp_v4_do_rcv 228 | -> tcp_rcv_established -> tcp_data_snd_check -> tcp_push_pending_frames 229 | -> __tcp_push_pending_frames -> ... 230 | ``` 231 | 232 | Call trace of `close(2)` 233 | ```text 234 | __fput -> sock_close -> __sock_release -> inet_release -> tcp_close -> tcp_send_fin 235 | -> __tcp_push_pending_frames -> tcp_write_xmit -> tcp_transmit_skb -> ... 236 | ``` 237 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Shuo's Linux TCP/IP notes 2 | theme: 3 | name: readthedocs 4 | analytics: 5 | gtag: G-4PL3B2ZGE8 6 | nav: 7 | - Home: index.md 8 | - Sockets: sockets.md 9 | - Throughput: throughput.md 10 | - Profiling: profile.md 11 | - Walkthrough: walkthrough.md 12 | - Nagle's Algorithm: nagle.md 13 | - Slow Start: slowstart.md 14 | - Loss Recovery: recovery.md 15 | - Reno CC: reno.md 16 | - W.
Richard Stevens: stevens.md 17 | - Links: links.md 18 | - History: history.md 19 | extra_css: [extra.css] 20 | markdown_extensions: 21 | - mkdocs_graphviz: 22 | light_theme: 000000 23 | dark_theme: FFFFFF 24 | color: 000000 25 | bgcolor: none 26 | 27 | extra_javascript: 28 | - https://cdn.jsdelivr.net/gh/rod2ik/cdn@main/mkdocs/javascripts/mkdocs-graphviz.js 29 | -------------------------------------------------------------------------------- /papers/MIT-LCS-TR-494.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/MIT-LCS-TR-494.pdf -------------------------------------------------------------------------------- /papers/TCP-misbehaving-receiver-CCR99.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/TCP-misbehaving-receiver-CCR99.pdf -------------------------------------------------------------------------------- /papers/TCP_Congestion_Control_Comparison.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/TCP_Congestion_Control_Comparison.pdf -------------------------------------------------------------------------------- /papers/cardwell-modeling-TCP-latency-infocom2000.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/cardwell-modeling-TCP-latency-infocom2000.pdf -------------------------------------------------------------------------------- /papers/compare-autotune02.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/compare-autotune02.pdf -------------------------------------------------------------------------------- /papers/congavoid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/congavoid.pdf -------------------------------------------------------------------------------- /papers/cubic08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/cubic08.pdf -------------------------------------------------------------------------------- /papers/dynamics-91.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/dynamics-91.pdf -------------------------------------------------------------------------------- /papers/ff96-sacks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/ff96-sacks.pdf -------------------------------------------------------------------------------- /papers/mathis-tcpautotune-sigcomm98.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/mathis-tcpautotune-sigcomm98.pdf -------------------------------------------------------------------------------- /papers/mathis-tcpmodel-ccr97.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/mathis-tcpmodel-ccr97.pdf -------------------------------------------------------------------------------- /papers/paxson-e2e-packets-sigcomm97.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/paxson-e2e-packets-sigcomm97.pdf -------------------------------------------------------------------------------- /papers/paxson-tcpanaly-sigcomm97.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/paxson-tcpanaly-sigcomm97.pdf -------------------------------------------------------------------------------- /papers/reneging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/reneging.pdf -------------------------------------------------------------------------------- /papers/traffic-policing-sigcomm16.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshuo/tcpip-study/5ac37b62e77316334e7aa2a1973fa902443a5f51/papers/traffic-policing-sigcomm16.pdf --------------------------------------------------------------------------------