├── .editorconfig ├── .gitattributes ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ └── quic-draft-issue.md ├── in-solidarity.yml ├── release-drafter.yml └── workflows │ ├── archive.yml │ ├── assign-to-project.yml │ ├── ghpages.yml │ ├── publish.yml │ └── update.yml ├── .gitignore ├── .lint.py ├── .travis.yml ├── CONTRIBUTING.md ├── Makefile ├── README.md ├── ietf.json ├── protection-samples.js ├── rfc8999.md ├── rfc9000.md ├── rfc9001.md ├── rfc9002.md ├── rfc9114.md ├── rfc9204.md ├── tag.sh ├── writeups └── base-drafts.md └── xml2rfc-tidy.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # See http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*.md] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_size = 2 9 | indent_style = space 10 | insert_final_newline = true 11 | max_line_length = 80 12 | trim_trailing_whitespace = true 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.md text 7 | *.xml text 8 | 9 | # Declare files that will always have LF line endings on checkout. 
10 | *.sh text eol=lf 11 | *.mk text eol=lf -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @quicwg/chairs 2 | 3 | draft-ietf-quic-http.md @MikeBishop 4 | 5 | draft-ietf-quic-invariants.md @martinthomson 6 | 7 | draft-ietf-quic-qpack.md @MikeBishop @afrind 8 | 9 | draft-ietf-quic-recovery.md @janaiyengar @ianswett 10 | 11 | draft-ietf-quic-tls.md @martinthomson @seanturner 12 | 13 | draft-ietf-quic-transport.md @janaiyengar @martinthomson 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/quic-draft-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUIC Draft Issue 3 | about: File an issue with a QUIC draft document 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Before opening an issue, please familiarise yourself with the QUIC WG [Contribution Guidelines](https://github.com/quicwg/base-drafts/blob/master/CONTRIBUTING.md) and [Late-Stage Process](https://github.com/quicwg/base-drafts/blob/master/CONTRIBUTING.md#late-stage-process). 11 | 12 | All documents in this repository follow this process. Before filing a new issue against any of them, please consider a few things: 13 | 14 | * Issues should be just that; issues with our deliverables, **not proposals, questions or support requests**. 15 | * Please review the issues list to make sure that you aren't filing a duplicate. Design issues that revisit a topic where there's already declared consensus (see https://github.com/quicwg/base-drafts/issues?q=is%3Aclosed+label%3Ahas-consensus) need to provide compelling reasons to warrant reopening the discussion. 16 | * If you're not sure how to phrase your issue, please ask on the [mailing list](https://www.ietf.org/mailman/listinfo/quic). 
17 | -------------------------------------------------------------------------------- /.github/in-solidarity.yml: -------------------------------------------------------------------------------- 1 | _extends: ietf/terminology 2 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: Transport 3 | label: -transport 4 | - title: Recovery 5 | label: -recovery 6 | - title: TLS 7 | label: -tls 8 | - title: HTTP/3 9 | label: -http 10 | change-template: - $TITLE (#$NUMBER) 11 | template: | 12 | ## What’s Changed 13 | 14 | $CHANGES -------------------------------------------------------------------------------- /.github/workflows/archive.yml: -------------------------------------------------------------------------------- 1 | name: "Archive Issues and Pull Requests" 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * 0,2,4' 6 | repository_dispatch: 7 | types: [archive] 8 | workflow_dispatch: 9 | inputs: 10 | archive_full: 11 | description: 'Recreate the archive from scratch' 12 | default: false 13 | type: boolean 14 | 15 | jobs: 16 | build: 17 | name: "Archive Issues and Pull Requests" 18 | runs-on: ubuntu-latest 19 | steps: 20 | - name: "Checkout" 21 | uses: actions/checkout@v4 22 | 23 | # Note: No caching for this build! 
24 | 25 | - name: "Update Archive" 26 | uses: martinthomson/i-d-template@v1 27 | env: 28 | ARCHIVE_FULL: ${{ inputs.archive_full }} 29 | with: 30 | make: archive 31 | token: ${{ github.token }} 32 | 33 | - name: "Update GitHub Pages" 34 | uses: martinthomson/i-d-template@v1 35 | with: 36 | make: gh-archive 37 | token: ${{ github.token }} 38 | 39 | - name: "Save Archive" 40 | uses: actions/upload-artifact@v4 41 | with: 42 | path: archive.json 43 | -------------------------------------------------------------------------------- /.github/workflows/assign-to-project.yml: -------------------------------------------------------------------------------- 1 | name: Auto Assign to Late Stage Processing Project 2 | 3 | on: 4 | issues: 5 | types: [opened, labeled] 6 | env: 7 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 8 | 9 | jobs: 10 | assign_one_project: 11 | runs-on: ubuntu-latest 12 | name: Assign to One Project 13 | steps: 14 | - name: Assign NEW issues to Late Stage Processing 15 | uses: srggrs/assign-one-project-github-action@1.2.0 16 | if: github.event.action == 'opened' 17 | with: 18 | project: 'https://github.com/quicwg/base-drafts/projects/5' 19 | -------------------------------------------------------------------------------- /.github/workflows/ghpages.yml: -------------------------------------------------------------------------------- 1 | name: "Update Editor's Copy" 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - README.md 7 | - CONTRIBUTING.md 8 | - LICENSE.md 9 | - .gitignore 10 | pull_request: 11 | paths-ignore: 12 | - README.md 13 | - CONTRIBUTING.md 14 | - LICENSE.md 15 | - .gitignore 16 | 17 | jobs: 18 | build: 19 | name: "Update Editor's Copy" 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: "Checkout" 23 | uses: actions/checkout@v4 24 | 25 | - name: "Setup" 26 | id: setup 27 | run: date -u "+date=%FT%T" >>"$GITHUB_OUTPUT" 28 | 29 | - name: "Caching" 30 | uses: actions/cache@v4 31 | with: 32 | path: | 33 | .refcache 34 | .venv 35 | .gems 36 | 
node_modules 37 | .targets.mk 38 | key: i-d-${{ steps.setup.outputs.date }} 39 | restore-keys: i-d- 40 | 41 | - name: "Build Drafts" 42 | uses: martinthomson/i-d-template@v1 43 | with: 44 | token: ${{ github.token }} 45 | 46 | - name: "Update GitHub Pages" 47 | uses: martinthomson/i-d-template@v1 48 | if: ${{ github.event_name == 'push' }} 49 | with: 50 | make: gh-pages 51 | token: ${{ github.token }} 52 | 53 | - name: "Archive Built Drafts" 54 | uses: actions/upload-artifact@v4 55 | with: 56 | path: | 57 | draft-*.html 58 | draft-*.txt 59 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: "Publish New Draft Version" 2 | 3 | on: 4 | push: 5 | tags: 6 | - "draft-*" 7 | workflow_dispatch: 8 | inputs: 9 | email: 10 | description: "Submitter email" 11 | default: "" 12 | type: string 13 | 14 | jobs: 15 | build: 16 | name: "Publish New Draft Version" 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: "Checkout" 20 | uses: actions/checkout@v4 21 | 22 | # See https://github.com/actions/checkout/issues/290 23 | - name: "Get Tag Annotations" 24 | run: git fetch -f origin ${{ github.ref }}:${{ github.ref }} 25 | 26 | - name: "Setup" 27 | id: setup 28 | run: date -u "+date=%FT%T" >>"$GITHUB_OUTPUT" 29 | 30 | - name: "Caching" 31 | uses: actions/cache@v4 32 | with: 33 | path: | 34 | .refcache 35 | .venv 36 | .gems 37 | node_modules 38 | .targets.mk 39 | key: i-d-${{ steps.setup.outputs.date }} 40 | restore-keys: i-d- 41 | 42 | - name: "Build Drafts" 43 | uses: martinthomson/i-d-template@v1 44 | with: 45 | token: ${{ github.token }} 46 | 47 | - name: "Upload to Datatracker" 48 | uses: martinthomson/i-d-template@v1 49 | with: 50 | make: upload 51 | env: 52 | UPLOAD_EMAIL: ${{ inputs.email }} 53 | 54 | - name: "Archive Submitted Drafts" 55 | uses: actions/upload-artifact@v4 56 | with: 57 | path: 
"versioned/draft-*-[0-9][0-9].*" 58 | -------------------------------------------------------------------------------- /.github/workflows/update.yml: -------------------------------------------------------------------------------- 1 | name: "Update Generated Files" 2 | # This rule is not run automatically. 3 | # It can be run manually to update all of the files that are part 4 | # of the template, specifically: 5 | # - README.md 6 | # - CONTRIBUTING.md 7 | # - .note.xml 8 | # - .github/CODEOWNERS 9 | # - Makefile 10 | # 11 | # 12 | # This might be useful if you have: 13 | # - added, removed, or renamed drafts (including after adoption) 14 | # - added, removed, or changed draft editors 15 | # - changed the title of drafts 16 | # 17 | # Note that this removes any customizations you have made to 18 | # the affected files. 19 | on: workflow_dispatch 20 | 21 | jobs: 22 | build: 23 | name: "Update Files" 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: "Checkout" 27 | uses: actions/checkout@v4 28 | 29 | - name: "Update Generated Files" 30 | uses: martinthomson/i-d-template@v1 31 | with: 32 | make: update-files 33 | token: ${{ github.token }} 34 | 35 | - name: "Push Update" 36 | run: git push 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.pdf 3 | *.redxml 4 | *.swp 5 | *.txt 6 | *.upload 7 | *~ 8 | .refcache 9 | .tags 10 | .targets.mk 11 | /*-[0-9][0-9].xml 12 | /lib 13 | /node_modules/ 14 | /old/ 15 | Gemfile.lock 16 | archive.json 17 | package-lock.json 18 | report.xml 19 | rfc8999.xml 20 | rfc9000.xml 21 | rfc9001.xml 22 | rfc9002.xml 23 | rfc9114.xml 24 | rfc9204.xml 25 | -------------------------------------------------------------------------------- /.lint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import argparse 5 | import 
re 6 | 7 | parser = argparse.ArgumentParser(description="Lint markdown drafts.") 8 | parser.add_argument("files", metavar="file", nargs="+", help="Files to lint") 9 | parser.add_argument("-l", dest="maxLineLength", default=80) 10 | parser.add_argument("-f", dest="maxFigureLineLength", default=66) 11 | 12 | args = parser.parse_args() 13 | 14 | foundError = False 15 | 16 | for inputfile in args.files: 17 | insideFigure = False 18 | beforeAbstract = True 19 | 20 | with open(inputfile, mode="rt", newline=None, encoding="utf-8") as draft: 21 | linenumber = 0 22 | lines = draft.readlines() 23 | 24 | abstract = re.compile("^--- abstract") 25 | table = re.compile("^\s*(?:\||{:)") 26 | figure = re.compile("^[~`]{3,}") 27 | 28 | for line in lines: 29 | line = line.rstrip("\r\n") 30 | linenumber += 1 31 | 32 | def err(msg): 33 | global foundError 34 | foundError = True 35 | sys.stderr.write("{0}:{1}: {2}\n".format(inputfile, linenumber, msg)) 36 | sys.stderr.write("{0}\n".format(line)) 37 | 38 | if line.find("\t") >= 0: 39 | err("Line contains HTAB") 40 | 41 | # Skip everything before abstract 42 | if beforeAbstract: 43 | matchObj = abstract.match(line) 44 | if matchObj: 45 | beforeAbstract = False 46 | continue 47 | 48 | # Skip tables 49 | matchObj = table.match(line) 50 | if matchObj: 51 | continue 52 | 53 | # Toggle figure state 54 | matchObj = figure.match(line) 55 | if matchObj: 56 | insideFigure = not insideFigure 57 | continue 58 | 59 | # Check length 60 | length = len(line) 61 | limit = ( 62 | int(args.maxFigureLineLength) 63 | if insideFigure 64 | else int(args.maxLineLength) 65 | ) 66 | if length > limit: 67 | err("Line is {0} characters; limit is {1}".format(length, limit)) 68 | 69 | sys.exit(1 if foundError else 0) 70 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | dist: trusty 4 | 5 | addons: 6 
| apt: 7 | packages: 8 | - python-pip 9 | - xsltproc 10 | 11 | env: 12 | global: 13 | - GOPATH="${TRAVIS_BUILD_DIR}/.go_workspace" 14 | - mmark_src=github.com/miekg/mmark/mmark 15 | - mmark=./mmark 16 | 17 | install: 18 | - pip install xml2rfc 19 | - if head -1 -q *.md | grep '^\-\-\-' >/dev/null 2>&1; then gem install --no-doc kramdown-rfc2629; fi 20 | - if head -1 -q *.md | grep '^%%%' >/dev/null 2>&1; then go get "$mmark_src" && go build "$mmark_src"; fi 21 | 22 | script: make ghpages 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # QUIC version 1 is done 2 | 3 | The base-drafts repository is the historical home of the QUIC version 1 4 | specifications that were written by the IETF QUIC Working Group. 5 | 6 | The set of documents are described [here](https://github.com/quicwg/base-drafts/blob/main/README.md). 7 | 8 | **Be aware that all contributions fall under the "[NOTE WELL](#note-well)" terms 9 | outlined below and our [Code of Conduct](#code-of-conduct) applies.** 10 | 11 | # Engaging with the QUIC community 12 | 13 | The scope of work in the QUIC Working Group is described in our 14 | [charter](https://datatracker.ietf.org/wg/quic/about/) and it extends beyond the 15 | development of the documents held in this repository. Anyone is welcome to 16 | contribute to the QUIC Working Group; you don't have to join the Working Group, 17 | because there is no "membership" -- anyone who participates in the work **is** a 18 | part of the QUIC Working Group. 19 | 20 | Before doing so, please familiarize yourself with our 21 | [charter](https://datatracker.ietf.org/wg/quic/about/). If you're new to IETF 22 | work, you may also want to read the [Tao of the 23 | IETF](https://www.ietf.org/tao.html). 
24 | 25 | ## Following Discussion 26 | 27 | The Working Group has a few venues for discussion: 28 | 29 | * We plan to meet at all [IETF meetings](https://www.ietf.org/meeting/) for the 30 | foreseeable future, and possibly hold interim meetings between them as 31 | required. Agendas, minutes, and presentations are available in our [meeting 32 | materials repository](https://github.com/quicwg/wg-materials) and the 33 | [official proceedings](https://datatracker.ietf.org/wg/quic/meetings/). 34 | 35 | * Our [mailing list](https://www.ietf.org/mailman/listinfo/quic) is used for 36 | most communication, including notifications of meetings, new drafts, consensus 37 | calls and other business, as well as issue discussion. 38 | 39 | * We maintain several repositories in our GitHub organization 40 | [Github](https://github.com/quicwg/). Specific issues are discussed on the 41 | relevant issues list. If you don't want to use Github to follow these 42 | discussions, you can subscribe to the [issue announce 43 | list](https://www.ietf.org/mailman/listinfo/quic-issues). 44 | 45 | * The [quicdev Slack](https://quicdev.slack.com/) is used for more realtime 46 | communication, typically amongst implementers, operators and researchers. 47 | Contact the [WG chairs](mailto:quic-chairs@ietf.org) for an invitation. Note that 48 | discussions on Slack are subject to the contribution guidelines described in 49 | this document. 50 | 51 | To be active in the Working Group, you can participate in any of these places. 52 | Most activity takes place on the mailing list, but if you just want to comment 53 | on and raise issues, that's fine too. 54 | 55 | ## Code of Conduct 56 | 57 | The [IETF Guidelines for Conduct](https://tools.ietf.org/html/rfc7154) applies to all Working Group 58 | communications and meetings. 
59 | 60 | 61 | ## NOTE WELL 62 | 63 | Any submission to the [IETF](https://www.ietf.org/) intended by the Contributor for publication as 64 | all or part of an IETF Internet-Draft or RFC and any statement made within the context of an IETF 65 | activity is considered an "IETF Contribution". Such statements include oral statements in IETF 66 | sessions, as well as written and electronic communications made at any time or place, which are 67 | addressed to: 68 | 69 | * The IETF plenary session 70 | * The IESG, or any member thereof on behalf of the IESG 71 | * Any IETF mailing list, including the IETF list itself, any working group 72 | or design team list, or any other list functioning under IETF auspices 73 | * Any IETF working group or portion thereof 74 | * Any Birds of a Feather (BOF) session 75 | * The IAB or any member thereof on behalf of the IAB 76 | * The RFC Editor or the Internet-Drafts function 77 | * All IETF Contributions are subject to the rules of 78 | [RFC 5378](https://tools.ietf.org/html/rfc5378) and 79 | [RFC 8179](https://tools.ietf.org/html/rfc8179). 80 | 81 | Statements made outside of an IETF session, mailing list or other function, that are clearly not 82 | intended to be input to an IETF activity, group or function, are not IETF Contributions in the 83 | context of this notice. 84 | 85 | Please consult [RFC 5378](https://tools.ietf.org/html/rfc5378) and [RFC 8179](https://tools.ietf.org/html/rfc8179) for details. 86 | 87 | A participant in any IETF activity is deemed to accept all IETF rules of process, as documented in 88 | Best Current Practices RFCs and IESG Statements. 89 | 90 | A participant in any IETF activity acknowledges that written, audio and video records of meetings 91 | may be made and may be available to the public. 
92 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MD_PREPROCESSOR := sed -e 's/{DATE}/$(shell date '+%Y-%m-%d')/g' 2 | TIDY := true 3 | 4 | LIBDIR := lib 5 | include $(LIBDIR)/main.mk 6 | 7 | $(LIBDIR)/main.mk: 8 | ifneq (,$(shell git submodule status $(LIBDIR) 2>/dev/null)) 9 | git submodule sync 10 | git submodule update $(CLONE_ARGS) --init 11 | else 12 | git clone -q --depth 10 $(CLONE_ARGS) \ 13 | -b main https://github.com/martinthomson/i-d-template $(LIBDIR) 14 | endif 15 | 16 | latest:: lint 17 | .PHONY: lint 18 | 19 | lint:: 20 | @$(trace) wslint $(python) ./.lint.py $(addsuffix .md,$(drafts)) 21 | 22 | show-next: 23 | @echo $(drafts_next) 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QUIC Protocol Drafts 2 | 3 | The base-drafts repository is the historical home of the QUIC version 1 4 | specifications that were written by the QUIC Working Group. 5 | 6 | **The documents have now been published as RFCs. Technical or editorial 7 | errata can be reported to the RFC Editor using the [errata 8 | tool](https://www.rfc-editor.org/errata.php).** 9 | 10 | **The QUIC Working Group welcomes discussion about new versions of QUIC, and new 11 | extensions to QUIC, or other proposals related to the QUIC transport. 
See 12 | [Engaging with the QUIC 13 | community](https://github.com/quicwg/base-drafts/blob/main/CONTRIBUTING.md#engaging-with-the-quic-community) 14 | for guidance.** 15 | 16 | ## QUIC Invariants 17 | 18 | * [RFC 8999](https://quicwg.org/base-drafts/rfc8999.html) 19 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-invariants) 20 | 21 | ## Core Transport Protocol 22 | 23 | * [RFC 9000](https://quicwg.org/base-drafts/rfc9000.html) 24 | * [Working Group Draft](https://datatracker.ietf.org/doc/html/draft-ietf-quic-transport) 25 | 26 | ## Loss Detection & Congestion Control 27 | 28 | * [RFC 9002](https://quicwg.org/base-drafts/rfc9002.html) 29 | * [Working Group Draft](https://datatracker.ietf.org/doc/html/draft-ietf-quic-recovery) 30 | 31 | ## TLS Mapping 32 | 33 | * [RFC 9001](https://quicwg.org/base-drafts/rfc9001.html) 34 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-tls) 35 | 36 | ## HTTP Mapping 37 | 38 | * [RFC 9114](https://quicwg.org/base-drafts/rfc9114.html) 39 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-http) 40 | 41 | ## QPACK 42 | 43 | * [RFC 9204](https://quicwg.org/base-drafts/rfc9204.html) 44 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-qpack) 45 | -------------------------------------------------------------------------------- /ietf.json: -------------------------------------------------------------------------------- 1 | { 2 | "group": "quic", 3 | "group_info": { 4 | "name": "QUIC", 5 | "type": "wg", 6 | "email": "quic@ietf.org", 7 | "activity_exclude_labels": ["editorial"] 8 | }, 9 | "repo_type": "specs", 10 | "revisions_tagged": true, 11 | "activity_summary_to": ["group_email"] 12 | } -------------------------------------------------------------------------------- /protection-samples.js: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ':' //; exec "$(command -v nodejs || command -v node)" "$0" 
"$@" 3 | 4 | // This script performs simple encryption and decryption for Initial packets. 5 | // It's crude, but it should be sufficient to generate examples. 6 | 7 | 8 | 'use strict'; 9 | require('buffer'); 10 | const assert = require('assert'); 11 | const crypto = require('crypto'); 12 | 13 | const INITIAL_SALT = Buffer.from('38762cf7f55934b34d179ae6a4c80cadccbb7f0a', 'hex'); 14 | const RETRY_KEY = Buffer.from('be0c690b9f66575a1d766b54e368c84e', 'hex'); 15 | const RETRY_NONCE = Buffer.from('461599d35d632bf2239825bb', 'hex'); 16 | const SHA256 = 'sha256'; 17 | const AES_GCM = 'aes-128-gcm'; 18 | const AES_ECB = 'aes-128-ecb'; 19 | 20 | const version = '00000001'; 21 | 22 | function chunk(s, n) { 23 | return (new Array(Math.ceil(s.length / n))) 24 | .fill() 25 | .map((_, i) => s.slice(i * n, i * n + n)); 26 | } 27 | 28 | function log(m, k) { 29 | console.log(m + ' [' + k.length + ']: ' + chunk(k.toString('hex'), 32).join(' ')); 30 | }; 31 | 32 | class HMAC { 33 | constructor(hash) { 34 | this.hash = hash; 35 | } 36 | 37 | digest(key, input) { 38 | var hmac = crypto.createHmac(this.hash, key); 39 | hmac.update(input); 40 | return hmac.digest(); 41 | } 42 | } 43 | 44 | /* HKDF as defined in RFC5869, with HKDF-Expand-Label from RFC8446. 
*/ 45 | class QHKDF { 46 | constructor(hmac, prk) { 47 | this.hmac = hmac; 48 | this.prk = prk; 49 | } 50 | 51 | static extract(hash, salt, ikm) { 52 | var hmac = new HMAC(hash); 53 | return new QHKDF(hmac, hmac.digest(salt, ikm)); 54 | } 55 | 56 | expand(info, len) { 57 | var output = Buffer.alloc(0); 58 | var T = Buffer.alloc(0); 59 | info = Buffer.from(info, 'ascii'); 60 | var counter = 0; 61 | var cbuf = Buffer.alloc(1); 62 | while (output.length < len) { 63 | cbuf.writeUIntBE(++counter, 0, 1); 64 | T = this.hmac.digest(this.prk, Buffer.concat([T, info, cbuf])); 65 | output = Buffer.concat([output, T]); 66 | } 67 | 68 | return output.slice(0, len); 69 | } 70 | 71 | expand_label(label, len) { 72 | const prefix = "tls13 "; 73 | var info = Buffer.alloc(2 + 1 + prefix.length + label.length + 1); 74 | // Note that Buffer.write returns the number of bytes written, whereas 75 | // Buffer.writeUIntBE returns the end offset of the write. Consistency FTW. 76 | var offset = info.writeUIntBE(len, 0, 2); 77 | offset = info.writeUIntBE(prefix.length + label.length, offset, 1); 78 | offset += info.write(prefix + label, offset); 79 | info.writeUIntBE(0, offset, 1); 80 | log('info for ' + label, info); 81 | return this.expand(info, len); 82 | } 83 | } 84 | 85 | // XOR b into a. 
// In-place, byte-wise XOR.  Every index of `a` is visited; where `b` has no
// byte at that index, `b[i]` is undefined, which `^=` coerces to 0, so that
// byte of `a` is left unchanged.  Nothing is returned.
function xor(a, b) {
  a.forEach((_, i) => {
    a[i] ^= b[i];
  });
}

// Returns a copy of `iv` with `counter` XORed into its last 6 bytes
// (big-endian).  The XOR is performed in two 24-bit halves because JavaScript
// bitwise operators truncate their operands to 32 bits.
function applyNonce(iv, counter) {
  var nonce = Buffer.from(iv);
  const m = nonce.readUIntBE(nonce.length - 6, 6);
  const x = ((m ^ counter) & 0xffffff) +
    ((((m / 0x1000000) ^ (counter / 0x1000000)) & 0xffffff) * 0x1000000);
  nonce.writeUIntBE(x, nonce.length - 6, 6);
  return nonce;
}

// Packet protection for Initial packets.  The constructor derives the AEAD
// key, IV and header protection key for one direction (`label`) from the
// connection ID `cid`; the methods then protect and unprotect packets with
// AES-128-GCM and an AES-128-ECB header protection mask.
class InitialProtection {
  // label: HKDF label for the direction secret (the callers in this script
  //        pass 'client in' or 'server in').
  // cid:   Buffer used as the input keying material for the initial secret.
  constructor(label, cid) {
    var qhkdf = QHKDF.extract(SHA256, INITIAL_SALT, cid);
    log('initial_secret', qhkdf.prk);
    qhkdf = new QHKDF(qhkdf.hmac, qhkdf.expand_label(label, 32));
    log(label + ' secret', qhkdf.prk);
    this.key = qhkdf.expand_label("quic key", 16);
    log(label + ' key', this.key);
    this.iv = qhkdf.expand_label("quic iv", 12);
    log(label + ' iv', this.iv);
    this.hp = qhkdf.expand_label("quic hp", 16);
    log(label + ' hp', this.hp);
  }

  // Returns the AEAD nonce for packet number `counter`.
  generateNonce(counter) {
    return applyNonce(this.iv, counter);
  }

  // Returns the encrypted data with authentication tag appended.  The AAD is
  // used, but not added to the output.
  encipher(pn, aad, data) {
    console.log('encipher pn', pn);
    log('encipher aad', aad);
    log('encipher data', data);
    var nonce = this.generateNonce(pn);
    var gcm = crypto.createCipheriv(AES_GCM, this.key, nonce);
    gcm.setAAD(aad);
    var e = gcm.update(data);
    gcm.final();
    e = Buffer.concat([e, gcm.getAuthTag()]);
    log('enciphered', e);
    return e;
  }

  // Inverse of encipher(): `data` is ciphertext followed by a 16-byte tag.
  // Returns the plaintext; gcm.final() throws if authentication fails.
  decipher(pn, aad, data) {
    console.log('decipher pn', pn);
    log('decipher aad', aad);
    log('decipher data', data);
    var nonce = this.generateNonce(pn);
    var gcm = crypto.createDecipheriv(AES_GCM, this.key, nonce);
    gcm.setAAD(aad);
    gcm.setAuthTag(data.slice(data.length - 16));
    var d = gcm.update(data.slice(0, data.length - 16));
    gcm.final();
    log('deciphered', d);
    return d;
  }

  // Calculates the header protection mask.  Returns 16 bytes of output.
  hpMask(sample) {
    log('hp sample', sample);
    // var ctr = crypto.createCipheriv('aes-128-ctr', this.hp, sample);
    // var mask = ctr.update(Buffer.alloc(5));
    var ecb = crypto.createCipheriv(AES_ECB, this.hp, Buffer.alloc(0));
    var mask = ecb.update(sample);
    log('hp mask', mask);
    return mask;
  }

  // hdr is everything before the length field
  // hdr[0] has the packet number length already in place
  // pn is the packet number
  // data is the payload (i.e., encoded frames)
  //
  // Returns the protected packet: masked header followed by the enciphered
  // payload.  Throws if packet number plus payload is shorter than 4 bytes.
  encrypt(hdr, pn, data) {
    var pn_len = 1 + (hdr[0] & 0x3);
    if (pn_len + data.length < 4) {
      throw new Error('insufficient length of packet number and payload');
    }

    var aad = Buffer.alloc(hdr.length + 2 + pn_len);
    var offset = hdr.copy(aad);
    // Add a length that covers the packet number encoding and the auth tag.
    offset = aad.writeUIntBE(0x4000 | (pn_len + data.length + 16), offset, 2);
    var pn_offset = offset;
    // Unsigned shift: `0xffffffff >> n` would be a sign-propagating shift of
    // -1 and always yield -1, i.e. no truncation at all.
    var pn_mask = 0xffffffff >>> (8 * (4 - pn_len));
    // `&` yields a signed 32-bit value; `>>> 0` makes it unsigned again so
    // writeUIntBE accepts it when bit 31 is set.
    offset = aad.writeUIntBE((pn & pn_mask) >>> 0, offset, pn_len);
    log('header', aad);

    var payload = this.encipher(pn, aad, data);

    // The sample starts 4 bytes after the start of the packet number field.
    var mask = this.hpMask(payload.slice(4 - pn_len, 20 - pn_len));
    // Long headers (top bit set) mask the low 4 bits, short headers the low 5.
    aad[0] ^= mask[0] & (0x1f >> (aad[0] >> 7));
    xor(aad.slice(pn_offset), mask.slice(1));
    log('masked header', aad);
    return Buffer.concat([aad, payload]);
  }

  // Maps a falsy `v` to 0 and any other `v` to `v + 3`.
  // NOTE(review): not called from the visible parts of this script; looks
  // like a leftover connection ID length decoder -- confirm before removing.
  cidLen(v) {
    if (!v) {
      return 0;
    }
    return v + 3;
  }

  // Removes header and packet protection from a long-header packet and
  // returns the decrypted payload.  Throws if the QUIC bit is missing, if the
  // packet has a short header, or if an Initial token length is multi-byte.
  decrypt(data) {
    log('decrypt', data);
    // Parentheses are required: `!==` and `===` bind tighter than `&`.
    if ((data[0] & 0x40) !== 0x40) {
      throw new Error('missing QUIC bit');
    }
    if ((data[0] & 0x80) === 0) {
      throw new Error('short header unsupported');
    }
    var hdr_len = 1 + 4;
    hdr_len += 1 + data[hdr_len]; // DCID
    hdr_len += 1 + data[hdr_len]; // SCID
    if ((data[0] & 0x30) === 0) { // Initial packet: token.
      if ((data[hdr_len] & 0xc0) !== 0) {
        throw new Error('multi-byte token length unsupported');
      }
      hdr_len += 1 + data[hdr_len]; // oops: this only handles single octet lengths.
    }
    // Skip the length.
    hdr_len += 1 << (data[hdr_len] >> 6);
    // Now we're at the encrypted bit.
    var mask = this.hpMask(data.slice(hdr_len + 4, hdr_len + 20));

    var octet0 = data[0] ^ (mask[0] & (0x1f >> (data[0] >> 7)));
    var pn_len = (octet0 & 3) + 1;
    var hdr = Buffer.from(data.slice(0, hdr_len + pn_len));
    hdr[0] = octet0;
    log('header', hdr);
    xor(hdr.slice(hdr_len), mask.slice(1));
    log('unmasked header', hdr);
    var pn = hdr.readUIntBE(hdr_len, pn_len);
    // Important: this doesn't recover PN based on expected value.
    // The expectation being that Initial packets won't ever need that.
    return this.decipher(pn, hdr, data.slice(hdr.length));
  }
}

// Zero-pads `body` so that the protected packet comes to 1200 bytes, given the
// unprotected header `hdr`.  Returns a new Buffer, or `body` itself when it
// already fills (or exceeds) the available space, so it is never truncated.
function pad(hdr, body) {
  var pn_len = (hdr[0] & 3) + 1;
  var size = 1200 - hdr.length - 2 - pn_len - 16; // Assume 2 byte length.
  if (size <= body.length) {
    return body;
  }
  var padded = Buffer.allocUnsafe(size);
  console.log('pad amount', size);
  body.copy(padded);
  padded.fill(0, body.length);
  log('padded', padded);
  return padded;
}

// Protects one packet, unprotects it again, and throws if the round trip does
// not reproduce the body.  `cid`, `hdr` and `body` are hex strings; client
// Initial packets are padded first.
function test(role, cid, hdr, pn, body) {
  cid = Buffer.from(cid, 'hex');
  log('connection ID', cid);
  hdr = Buffer.from(hdr, 'hex');
  log('header', hdr);
  console.log('packet number = ' + pn);
  body = Buffer.from(body, 'hex');
  log('body', body);

  if (role === 'client' && (hdr[0] & 0x30) === 0) {
    body = pad(hdr, body);
  }

  var endpoint = new InitialProtection(role + ' in', cid);
  var packet = endpoint.encrypt(hdr, pn, body);
  log('encrypted packet', packet);

  var content = endpoint.decrypt(packet);
  log('decrypted content', content);
  if (content.compare(body) !== 0) {
    throw new Error('decrypted result not the same as the original');
  }
}

// Prefixes the hex string `cid` with its length in bytes, encoded as exactly
// two hex digits (so connection IDs of 16 bytes or more encode correctly).
function hex_cid(cid) {
  return (cid.length / 2).toString(16).padStart(2, '0') + cid;
}

// Verify that the retry keys are correct.
// Re-derives the Retry key and nonce from a hard-coded secret and asserts
// that they equal the RETRY_KEY and RETRY_NONCE constants.
function derive_retry() {
  const retrySecret = Buffer.from(
    'd9c9943e6101fd200021506bcc02814c73030f25c79d71ce876eca876e6fca8e', 'hex');
  const kdf = new QHKDF(new HMAC(SHA256), retrySecret);

  const derivedKey = kdf.expand_label('quic key', 16);
  log('retry key', derivedKey);
  assert.deepStrictEqual(derivedKey, RETRY_KEY);

  const derivedNonce = kdf.expand_label('quic iv', 12);
  log('retry nonce', derivedNonce);
  assert.deepStrictEqual(derivedNonce, RETRY_NONCE);
}

// Builds and logs a sample Retry packet.  All three arguments are connection
// IDs as hex strings.  The integrity tag is the AES-128-GCM tag computed over
// the length-prefixed original DCID followed by the Retry header, with an
// empty plaintext.
function retry(dcid, scid, odcid) {
  const header = Buffer.concat([
    Buffer.from('ff' + version + hex_cid(dcid) + hex_cid(scid), 'hex'),
    Buffer.from('token', 'ascii'),
  ]);
  log('retry header', header);

  const aad = Buffer.concat([Buffer.from(hex_cid(odcid), 'hex'), header]);
  log('retry aad', aad);

  const aead = crypto.createCipheriv(AES_GCM, RETRY_KEY, RETRY_NONCE);
  aead.setAAD(aad);
  aead.update('');
  aead.final();
  const tag = aead.getAuthTag();
  log('retry', Buffer.concat([header, tag]));
}

// A simple ChaCha20-Poly1305 packet.
// Protects a short-header packet with ChaCha20-Poly1305 using hard-coded
// key, IV and header protection key, logging every intermediate value.
// pn:      packet number; its low 24 bits are encoded in a 3-byte field.
// payload: Buffer holding the plaintext payload.
function chacha20(pn, payload) {
  log('chacha20poly1305 pn=' + pn.toString(), payload);
  let header = Buffer.alloc(4);
  header.writeUIntBE(0x42, 0, 1);
  header.writeUIntBE(pn & 0xffffff, 1, 3);
  log('unprotected header', header);
  const key = Buffer.from('c6d98ff3441c3fe1b2182094f69caa2e' +
                          'd4b716b65488960a7a984979fb23e1c8', 'hex');
  const iv = Buffer.from('e0459b3474bdd0e44a41c144', 'hex');
  const nonce = applyNonce(iv, pn);
  log('nonce', nonce);
  let aead = crypto.createCipheriv('ChaCha20-Poly1305', key, nonce, { authTagLength: 16 });
  aead.setAAD(header);
  const e = aead.update(payload);
  aead.final();
  let ct = Buffer.concat([e, aead.getAuthTag()]);
  log('ciphertext', ct);

  // With a 3-byte packet number the sample starts 1 byte into the ciphertext.
  const sample = ct.slice(1, 17);
  log('sample', sample);
  const hp = Buffer.from('25a282b9e82f06f21f488917a4fc8f1b' +
                         '73573685608597d0efcb076b0ab7a7a4', 'hex');
  // The 16-byte sample is passed as the cipher's IV argument; the mask is the
  // keystream obtained by encrypting 5 zero bytes.
  let chacha = crypto.createCipheriv('ChaCha20', hp, sample);
  const mask = chacha.update(Buffer.alloc(5));
  log('mask', mask);
  // Short header: mask the low 5 bits of the first byte, then the packet
  // number bytes.
  header[0] ^= mask[0] & 0x1f;
  xor(header.slice(1), mask.slice(1));
  log('header', header);
  log('protected packet', Buffer.concat([header, ct]));
}

var cid = '8394c8f03e515708';

var ci_hdr = 'c3' + version + hex_cid(cid) + '0000';
// This is a client Initial.
338 | var crypto_frame = '060040f1' + 339 | '010000ed0303ebf8fa56f12939b9584a3896472ec40bb863cfd3e86804fe3a47' + 340 | 'f06a2b69484c00000413011302010000c000000010000e00000b6578616d706c' + 341 | '652e636f6dff01000100000a00080006001d0017001800100007000504616c70' + 342 | '6e000500050100000000003300260024001d00209370b2c9caa47fbabaf4559f' + 343 | 'edba753de171fa71f50f1ce15d43e994ec74d748002b0003020304000d001000' + 344 | '0e0403050306030203080408050806002d00020101001c000240010039003204' + 345 | '08ffffffffffffffff05048000ffff07048000ffff0801100104800075300901' + 346 | '100f088394c8f03e51570806048000ffff'; 347 | 348 | test('client', cid, ci_hdr, 2, crypto_frame); 349 | 350 | // This should be a valid server Initial. 351 | var frames = '02000000000600405a' + 352 | '020000560303eefce7f7b37ba1d163' + 353 | '2e96677825ddf73988cfc79825df566dc5430b9a04' + 354 | '5a1200130100002e00330024001d00209d3c940d89' + 355 | '690b84d08a60993c144eca684d1081287c834d5311' + 356 | 'bcf32bb9da1a002b00020304'; 357 | var scid = 'f067a5502a4262b5'; 358 | var si_hdr = 'c1' + version + '00' + hex_cid(scid) + '00'; 359 | test('server', cid, si_hdr, 1, frames); 360 | 361 | derive_retry(); 362 | retry('', scid, cid); 363 | chacha20(654360564, Buffer.from('01', 'hex')); 364 | -------------------------------------------------------------------------------- /rfc8999.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Version-Independent Properties of QUIC" 3 | abbrev: QUIC Invariants 4 | number: 8999 5 | docName: draft-ietf-quic-invariants-13 6 | date: 2021-05 7 | category: std 8 | consensus: true 9 | ipr: trust200902 10 | area: Transport 11 | workgroup: QUIC 12 | keyword: 13 | - crypto 14 | - next generation 15 | - protocol 16 | - secure 17 | - transport 18 | - UDP 19 | 20 | stand_alone: yes 21 | pi: [toc, sortrefs, symrefs, docmapping] 22 | 23 | author: 24 | - 25 | ins: M. 
Thomson 26 | name: Martin Thomson 27 | org: Mozilla 28 | email: mt@lowentropy.net 29 | 30 | informative: 31 | 32 | QUIC-TRANSPORT: 33 | title: "QUIC: A UDP-Based Multiplexed and Secure Transport" 34 | date: 2021-05 35 | seriesinfo: 36 | RFC: 9000 37 | DOI: 10.17487/RFC9000 38 | author: 39 | - 40 | ins: J. Iyengar 41 | name: Jana Iyengar 42 | org: Google 43 | role: editor 44 | - 45 | ins: M. Thomson 46 | name: Martin Thomson 47 | org: Mozilla 48 | role: editor 49 | 50 | QUIC-TLS: 51 | title: "Using TLS to Secure QUIC" 52 | date: 2021-05 53 | seriesinfo: 54 | RFC: 9001 55 | DOI: 10.17487/RFC9001 56 | author: 57 | - 58 | ins: M. Thomson 59 | name: Martin Thomson 60 | org: Mozilla 61 | role: editor 62 | - 63 | ins: S. Turner 64 | name: Sean Turner 65 | org: sn3rd 66 | role: editor 67 | 68 | 69 | --- abstract 70 | 71 | This document defines the properties of the QUIC transport protocol that are 72 | common to all versions of the protocol. 73 | 74 | 75 | --- middle 76 | 77 | # An Extremely Abstract Description of QUIC 78 | 79 | QUIC is a connection-oriented protocol between two endpoints. Those endpoints 80 | exchange UDP datagrams. These UDP datagrams contain QUIC packets. QUIC 81 | endpoints use QUIC packets to establish a QUIC connection, which is shared 82 | protocol state between those endpoints. 83 | 84 | 85 | # Fixed Properties of All QUIC Versions 86 | 87 | In addition to providing secure, multiplexed transport, QUIC {{QUIC-TRANSPORT}} 88 | allows for the option to negotiate a version. This allows the protocol to 89 | change over time in response to new requirements. Many characteristics of the 90 | protocol could change between versions. 91 | 92 | This document describes the subset of QUIC that is intended to remain stable as 93 | new versions are developed and deployed. All of these invariants are 94 | independent of the IP version. 95 | 96 | The primary goal of this document is to ensure that it is possible to deploy new 97 | versions of QUIC. 
By documenting the properties that cannot change, this 98 | document aims to preserve the ability for QUIC endpoints to negotiate changes to 99 | any other aspect of the protocol. As a consequence, this also guarantees a 100 | minimal amount of information that is made available to entities other than 101 | endpoints. Unless specifically prohibited in this document, any aspect of the 102 | protocol can change between different versions. 103 | 104 | {{bad-assumptions}} contains a non-exhaustive list of some incorrect assumptions 105 | that might be made based on knowledge of QUIC version 1; these do not apply to 106 | every version of QUIC. 107 | 108 | 109 | # Conventions and Definitions 110 | 111 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", 112 | "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this 113 | document are to be interpreted as described in BCP 14 {{!RFC2119}} {{!RFC8174}} 114 | when, and only when, they appear in all capitals, as shown here. 115 | 116 | This document defines requirements on future QUIC versions, even where normative 117 | language is not used. 118 | 119 | This document uses terms and notational conventions from {{QUIC-TRANSPORT}}. 120 | 121 | 122 | # Notational Conventions 123 | 124 | The format of packets is described using the notation defined in this section. 125 | This notation is the same as that used in {{QUIC-TRANSPORT}}. 126 | 127 | Complex fields are named and then followed by a list of fields surrounded by a 128 | pair of matching braces. Each field in this list is separated by commas. 129 | 130 | Individual fields include length information, plus indications about fixed 131 | value, optionality, or repetitions. 
Individual fields use the following 132 | notational conventions, with all lengths in bits: 133 | 134 | x (A): 135 | : Indicates that x is A bits long 136 | 137 | x (A..B): 138 | : Indicates that x can be any length from A to B; A can be omitted to indicate 139 | a minimum of zero bits, and B can be omitted to indicate no set upper limit; 140 | values in this format always end on a byte boundary 141 | 142 | x (L) = C: 143 | : Indicates that x has a fixed value of C; the length of x is described by 144 | L, which can use any of the length forms above 145 | 146 | x (L) ...: 147 | : Indicates that x is repeated zero or more times and that each instance has a 148 | length of L 149 | 150 | This document uses network byte order (that is, big endian) values. Fields 151 | are placed starting from the high-order bits of each byte. 152 | 153 | {{fig-ex-format}} shows an example structure: 154 | 155 | ~~~ 156 | Example Structure { 157 | One-bit Field (1), 158 | 7-bit Field with Fixed Value (7) = 61, 159 | Arbitrary-Length Field (..), 160 | Variable-Length Field (8..24), 161 | Repeated Field (8) ..., 162 | } 163 | ~~~ 164 | {: #fig-ex-format title="Example Format"} 165 | 166 | 167 | # QUIC Packets 168 | 169 | QUIC endpoints exchange UDP datagrams that contain one or more QUIC packets. 170 | This section describes the invariant characteristics of a QUIC packet. A 171 | version of QUIC could permit multiple QUIC packets in a single UDP datagram, but 172 | the invariant properties only describe the first packet in a datagram. 173 | 174 | QUIC defines two types of packet headers: long and short. Packets with a long 175 | header are identified by the most significant bit of the first byte being set; 176 | packets with a short header have that bit cleared. 177 | 178 | QUIC packets might be integrity protected, including the header. However, QUIC 179 | Version Negotiation packets are not integrity protected; see {{vn}}. 
180 | 181 | Aside from the values described here, the payload of QUIC packets is 182 | version specific and of arbitrary length. 183 | 184 | 185 | ## Long Header 186 | 187 | Long headers take the form described in {{fig-long}}. 188 | 189 | ~~~ 190 | Long Header Packet { 191 | Header Form (1) = 1, 192 | Version-Specific Bits (7), 193 | Version (32), 194 | Destination Connection ID Length (8), 195 | Destination Connection ID (0..2040), 196 | Source Connection ID Length (8), 197 | Source Connection ID (0..2040), 198 | Version-Specific Data (..), 199 | } 200 | ~~~ 201 | {: #fig-long title="QUIC Long Header"} 202 | 203 | A QUIC packet with a long header has the high bit of the first byte set to 1. 204 | All other bits in that byte are version specific. 205 | 206 | The next four bytes include a 32-bit Version field. Versions are described in 207 | {{version}}. 208 | 209 | The next byte contains the length in bytes of the Destination Connection ID 210 | field that follows it. This length is encoded as an 8-bit unsigned integer. 211 | The Destination Connection ID field follows the Destination Connection ID Length 212 | field and is between 0 and 255 bytes in length. Connection IDs are described in 213 | {{connection-id}}. 214 | 215 | The next byte contains the length in bytes of the Source Connection ID field 216 | that follows it. This length is encoded as an 8-bit unsigned integer. The 217 | Source Connection ID field follows the Source Connection ID Length field and is 218 | between 0 and 255 bytes in length. 219 | 220 | The remainder of the packet contains version-specific content. 221 | 222 | 223 | ## Short Header 224 | 225 | Short headers take the form described in {{fig-short}}. 
226 | 227 | ~~~~~ 228 | Short Header Packet { 229 | Header Form (1) = 0, 230 | Version-Specific Bits (7), 231 | Destination Connection ID (..), 232 | Version-Specific Data (..), 233 | } 234 | ~~~~~ 235 | {: #fig-short title="QUIC Short Header"} 236 | 237 | A QUIC packet with a short header has the high bit of the first byte set to 0. 238 | 239 | A QUIC packet with a short header includes a Destination Connection ID 240 | immediately following the first byte. The short header does not include the 241 | Destination Connection ID Length, Source Connection ID Length, Source Connection 242 | ID, or Version fields. The length of the Destination Connection ID is not 243 | encoded in packets with a short header and is not constrained by this 244 | specification. 245 | 246 | The remainder of the packet has version-specific semantics. 247 | 248 | 249 | ## Connection ID 250 | 251 | A connection ID is an opaque field of arbitrary length. 252 | 253 | The primary function of a connection ID is to ensure that changes in addressing 254 | at lower protocol layers (UDP, IP, and below) do not cause packets for a QUIC 255 | connection to be delivered to the wrong QUIC endpoint. The connection ID 256 | is used by endpoints and the intermediaries that support them to ensure that 257 | each QUIC packet can be delivered to the correct instance of an endpoint. At 258 | the endpoint, the connection ID is used to identify the QUIC connection for 259 | which the packet is intended. 260 | 261 | The connection ID is chosen by each endpoint using version-specific methods. 262 | Packets for the same QUIC connection might use different connection ID values. 263 | 264 | 265 | ## Version 266 | 267 | The Version field contains a 4-byte identifier. This value can be used by 268 | endpoints to identify a QUIC version. A Version field with a value of 269 | 0x00000000 is reserved for version negotiation; see {{vn}}. All other values 270 | are potentially valid. 
271 | 272 | The properties described in this document apply to all versions of QUIC. A 273 | protocol that does not conform to the properties described in this document is 274 | not QUIC. Future documents might describe additional properties that apply to 275 | a specific QUIC version or to a range of QUIC versions. 276 | 277 | 278 | # Version Negotiation {#vn} 279 | 280 | A QUIC endpoint that receives a packet with a long header and a version it 281 | either does not understand or does not support might send a Version Negotiation 282 | packet in response. Packets with a short header do not trigger version 283 | negotiation. 284 | 285 | A Version Negotiation packet sets the high bit of the first byte, and thus it 286 | conforms with the format of a packet with a long header as defined in 287 | {{long-header}}. A Version Negotiation packet is identifiable as such by the 288 | Version field, which is set to 0x00000000. 289 | 290 | ~~~ 291 | Version Negotiation Packet { 292 | Header Form (1) = 1, 293 | Unused (7), 294 | Version (32) = 0, 295 | Destination Connection ID Length (8), 296 | Destination Connection ID (0..2040), 297 | Source Connection ID Length (8), 298 | Source Connection ID (0..2040), 299 | Supported Version (32) ..., 300 | } 301 | ~~~ 302 | {: #version-negotiation-format title="Version Negotiation Packet"} 303 | 304 | Only the most significant bit of the first byte of a Version Negotiation packet 305 | has any defined value. The remaining 7 bits, labeled "Unused", can be set to 306 | any value when sending and MUST be ignored on receipt. 307 | 308 | After the Source Connection ID field, the Version Negotiation packet contains a 309 | list of Supported Version fields, each identifying a version that the endpoint 310 | sending the packet supports. A Version Negotiation packet contains no other 311 | fields. An endpoint MUST ignore a packet that contains no Supported Version 312 | fields or contains a truncated Supported Version value. 
313 | 314 | Version Negotiation packets do not use integrity or confidentiality protection. 315 | Specific QUIC versions might include protocol elements that allow endpoints to 316 | detect modification or corruption in the set of supported versions. 317 | 318 | An endpoint MUST include the value from the Source Connection ID field of the 319 | packet it receives in the Destination Connection ID field. The value for the 320 | Source Connection ID field MUST be copied from the Destination Connection ID 321 | field of the received packet, which is initially randomly selected by a client. 322 | Echoing both connection IDs gives clients some assurance that the server 323 | received the packet and that the Version Negotiation packet was not generated by 324 | an attacker that is unable to observe packets. 325 | 326 | An endpoint that receives a Version Negotiation packet might change the version 327 | that it decides to use for subsequent packets. The conditions under which an 328 | endpoint changes its QUIC version will depend on the version of QUIC that it 329 | chooses. 330 | 331 | See {{QUIC-TRANSPORT}} for a more thorough description of how an endpoint that 332 | supports QUIC version 1 generates and consumes a Version Negotiation packet. 333 | 334 | 335 | # Security and Privacy Considerations 336 | 337 | It is possible that middleboxes could observe traits of a specific version of 338 | QUIC and assume that when other versions of QUIC exhibit similar traits the same 339 | underlying semantic is being expressed. There are potentially many such traits; 340 | see {{bad-assumptions}}. Some effort has been made to either eliminate or 341 | obscure some observable traits in QUIC version 1, but many of these remain. 342 | Other QUIC versions might make different design decisions and so exhibit 343 | different traits. 
344 | 345 | The QUIC version number does not appear in all QUIC packets, which means that 346 | reliably extracting information from a flow based on version-specific traits 347 | requires that middleboxes retain state for every connection ID they see. 348 | 349 | The Version Negotiation packet described in this document is not 350 | integrity protected; it only has modest protection against insertion by 351 | attackers. An endpoint MUST authenticate the semantic content of a Version 352 | Negotiation packet if it attempts a different QUIC version as a result. 353 | 354 | 355 | --- back 356 | 357 | # Incorrect Assumptions {#bad-assumptions} 358 | 359 | There are several traits of QUIC version 1 {{QUIC-TRANSPORT}} that are not 360 | protected from observation but are nonetheless considered to be changeable when 361 | a new version is deployed. 362 | 363 | This section lists a sampling of incorrect assumptions that might be made about 364 | QUIC based on knowledge of QUIC version 1. Some of these statements are not 365 | even true for QUIC version 1. This is not an exhaustive list; it is intended to 366 | be illustrative only. 367 | 368 | **Any and all of the following statements can be false for a given QUIC 369 | version:** 370 | 371 | * QUIC uses TLS {{QUIC-TLS}} and some TLS messages are visible on the wire. 372 | 373 | * QUIC long headers are only exchanged during connection establishment. 374 | 375 | * Every flow on a given 5-tuple will include a connection establishment phase. 376 | 377 | * The first packets exchanged on a flow use the long header. 378 | 379 | * The last packet before a long period of quiescence might be assumed 380 | to contain only an acknowledgment. 381 | 382 | * QUIC uses an Authenticated Encryption with Associated Data (AEAD) function 383 | (AEAD_AES_128_GCM; see {{?RFC5116}}) to protect the packets it exchanges 384 | during connection establishment. 385 | 386 | * QUIC packet numbers are encrypted and appear as the first encrypted bytes. 
387 | 388 | * QUIC packet numbers increase by one for every packet sent. 389 | 390 | * QUIC has a minimum size for the first handshake packet sent by a client. 391 | 392 | * QUIC stipulates that a client speak first. 393 | 394 | * QUIC packets always have the second bit of the first byte (0x40) set. 395 | 396 | * A QUIC Version Negotiation packet is only sent by a server. 397 | 398 | * A QUIC connection ID changes infrequently. 399 | 400 | * QUIC endpoints change the version they speak if they are sent a Version 401 | Negotiation packet. 402 | 403 | * The Version field in a QUIC long header is the same in both directions. 404 | 405 | * A QUIC packet with a particular value in the Version field means that the 406 | corresponding version of QUIC is in use. 407 | 408 | * Only one connection at a time is established between any pair of QUIC 409 | endpoints. 410 | -------------------------------------------------------------------------------- /rfc9002.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: QUIC Loss Detection and Congestion Control 3 | abbrev: QUIC Loss Detection 4 | number: 9002 5 | docName: draft-ietf-quic-recovery-34 6 | date: 2021-05 7 | category: std 8 | consensus: true 9 | ipr: trust200902 10 | area: Transport 11 | workgroup: QUIC 12 | keyword: 13 | - bbr 14 | - delay-sensitive congestion control 15 | - fec 16 | - loss-tolerant congestion control 17 | - next generation 18 | 19 | stand_alone: yes 20 | pi: [toc, sortrefs, symrefs, docmapping] 21 | 22 | author: 23 | - 24 | ins: J. Iyengar 25 | name: Jana Iyengar 26 | org: Fastly 27 | email: jri.ietf@gmail.com 28 | role: editor 29 | - 30 | ins: I. 
Swett 31 | name: Ian Swett 32 | org: Google 33 | email: ianswett@google.com 34 | role: editor 35 | 36 | normative: 37 | 38 | QUIC-TRANSPORT: 39 | title: "QUIC: A UDP-Based Multiplexed and Secure Transport" 40 | date: 2021-05 41 | seriesinfo: 42 | RFC: 9000 43 | DOI: 10.17487/RFC9000 44 | author: 45 | - 46 | ins: J. Iyengar 47 | name: Jana Iyengar 48 | org: Fastly 49 | role: editor 50 | - 51 | ins: M. Thomson 52 | name: Martin Thomson 53 | org: Mozilla 54 | role: editor 55 | 56 | QUIC-TLS: 57 | title: "Using TLS to Secure QUIC" 58 | date: 2021-05 59 | seriesinfo: 60 | RFC: 9001 61 | DOI: 10.17487/RFC9001 62 | author: 63 | - 64 | ins: M. Thomson 65 | name: Martin Thomson 66 | org: Mozilla 67 | role: editor 68 | - 69 | ins: S. Turner 70 | name: Sean Turner 71 | org: sn3rd 72 | role: editor 73 | 74 | RFC8085: 75 | 76 | informative: 77 | 78 | FACK: 79 | title: "Forward acknowledgement: Refining TCP Congestion Control" 80 | author: 81 | - 82 | initials: M. 83 | surname: Mathis 84 | - 85 | initials: J. 86 | surname: Mahdavi 87 | date: 1996-08 88 | refcontent: ACM SIGCOMM Computer Communication Review 89 | seriesinfo: 90 | DOI: 10.1145/248157.248181 91 | 92 | RETRANSMISSION: 93 | title: "Improving Round-Trip Time Estimates in Reliable Transport Protocols" 94 | author: 95 | - 96 | initials: P. 97 | surname: Karn 98 | - 99 | initials: C. 100 | surname: Partridge 101 | date: 1991-11 102 | refcontent: ACM Transactions on Computer Systems 103 | seriesinfo: 104 | DOI: 10.1145/118544.118549 105 | 106 | RFC3465: 107 | 108 | --- abstract 109 | 110 | This document describes loss detection and congestion control mechanisms for 111 | QUIC. 112 | 113 | 114 | --- middle 115 | 116 | # Introduction 117 | 118 | 119 | QUIC is a secure, general-purpose transport protocol, described in 120 | {{QUIC-TRANSPORT}}. This document describes loss detection and congestion 121 | control mechanisms for QUIC. 
122 | 123 | # Conventions and Definitions 124 | 125 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", 126 | "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this 127 | document are to be interpreted as described in BCP 14 {{!RFC2119}} {{!RFC8174}} 128 | when, and only when, they appear in all capitals, as shown here. 129 | 130 | Definitions of terms that are used in this document: 131 | 132 | Ack-eliciting frames: 133 | 134 | : All frames other than ACK, PADDING, and CONNECTION_CLOSE are considered 135 | ack-eliciting. 136 | 137 | Ack-eliciting packets: 138 | 139 | : Packets that contain ack-eliciting frames elicit an ACK from the receiver 140 | within the maximum acknowledgment delay and are called ack-eliciting packets. 141 | 142 | In-flight packets: 143 | 144 | : Packets are considered in flight when they are ack-eliciting or contain a 145 | PADDING frame, and they have been sent but are not acknowledged, declared 146 | lost, or discarded along with old keys. 147 | 148 | # Design of the QUIC Transmission Machinery 149 | 150 | All transmissions in QUIC are sent with a packet-level header, which indicates 151 | the encryption level and includes a packet sequence number (referred to below as 152 | a packet number). The encryption level indicates the packet number space, as 153 | described in {{Section 12.3 of QUIC-TRANSPORT}}. Packet numbers never repeat 154 | within a packet number space for the lifetime of a connection. Packet numbers 155 | are sent in monotonically increasing order within a space, preventing ambiguity. 156 | It is permitted for some packet numbers to never be used, leaving intentional 157 | gaps. 158 | 159 | This design obviates the need for disambiguating between transmissions and 160 | retransmissions; this eliminates significant complexity from QUIC's 161 | interpretation of TCP loss detection mechanisms. 162 | 163 | QUIC packets can contain multiple frames of different types. 
The recovery 164 | mechanisms ensure that data and frames that need reliable delivery are 165 | acknowledged or declared lost and sent in new packets as necessary. The types 166 | of frames contained in a packet affect recovery and congestion control logic: 167 | 168 | * All packets are acknowledged, though packets that contain no 169 | ack-eliciting frames are only acknowledged along with ack-eliciting 170 | packets. 171 | 172 | * Long header packets that contain CRYPTO frames are critical to the 173 | performance of the QUIC handshake and use shorter timers for 174 | acknowledgment. 175 | 176 | * Packets containing frames besides ACK or CONNECTION_CLOSE frames count toward 177 | congestion control limits and are considered to be in flight. 178 | 179 | * PADDING frames cause packets to contribute toward bytes in flight without 180 | directly causing an acknowledgment to be sent. 181 | 182 | # Relevant Differences Between QUIC and TCP 183 | 184 | Readers familiar with TCP's loss detection and congestion control will find 185 | algorithms here that parallel well-known TCP ones. However, protocol differences 186 | between QUIC and TCP contribute to algorithmic differences. These protocol 187 | differences are briefly described below. 188 | 189 | ## Separate Packet Number Spaces 190 | 191 | QUIC uses separate packet number spaces for each encryption level, 192 | except that 0-RTT and all generations of 1-RTT keys use the same packet 193 | number space. Using separate packet number spaces ensures that the 194 | acknowledgment of packets sent with one level of encryption will not 195 | cause spurious retransmission of packets sent with a different 196 | encryption level. Congestion control and round-trip time (RTT) 197 | measurement are unified across packet number spaces.
198 | 199 | ## Monotonically Increasing Packet Numbers 200 | 201 | TCP conflates transmission order at the sender with delivery order at the 202 | receiver, resulting in the retransmission ambiguity problem 203 | {{RETRANSMISSION}}. QUIC separates transmission order from delivery order: 204 | packet numbers indicate transmission order, and delivery order is determined by 205 | the stream offsets in STREAM frames. 206 | 207 | QUIC's packet number is strictly increasing within a packet number space 208 | and directly encodes transmission order. A higher packet number signifies 209 | that the packet was sent later, and a lower packet number signifies that 210 | the packet was sent earlier. When a packet containing ack-eliciting 211 | frames is detected lost, QUIC includes necessary frames in a new packet 212 | with a new packet number, removing ambiguity about which packet is 213 | acknowledged when an ACK is received. Consequently, more accurate RTT 214 | measurements can be made, spurious retransmissions are trivially detected, and 215 | mechanisms such as Fast Retransmit can be applied universally, based only on 216 | packet number. 217 | 218 | This design point significantly simplifies loss detection mechanisms for QUIC. 219 | Most TCP mechanisms implicitly attempt to infer transmission ordering based on 220 | TCP sequence numbers -- a nontrivial task, especially when TCP timestamps are 221 | not available. 222 | 223 | ## Clearer Loss Epoch 224 | 225 | QUIC starts a loss epoch when a packet is lost. The loss epoch ends when any 226 | packet sent after the start of the epoch is acknowledged. TCP waits for the gap 227 | in the sequence number space to be filled, and so if a segment is lost multiple 228 | times in a row, the loss epoch may not end for several round trips. 
Because both 229 | should reduce their congestion windows only once per epoch, QUIC will do it once 230 | for every round trip that experiences loss, while TCP may only do it once across 231 | multiple round trips. 232 | 233 | ## No Reneging 234 | 235 | QUIC ACK frames contain information similar to that in TCP Selective 236 | Acknowledgments (SACKs) {{?RFC2018}}. However, QUIC does not allow a packet 237 | acknowledgment to be reneged, greatly simplifying implementations on both sides 238 | and reducing memory pressure on the sender. 239 | 240 | ## More ACK Ranges 241 | 242 | QUIC supports many ACK ranges, as opposed to TCP's three SACK ranges. In 243 | high-loss environments, this speeds recovery, reduces spurious retransmits, and 244 | ensures forward progress without relying on timeouts. 245 | 246 | ## Explicit Correction For Delayed Acknowledgments 247 | 248 | QUIC endpoints measure the delay incurred between when a packet is received and 249 | when the corresponding acknowledgment is sent, allowing a peer to maintain a 250 | more accurate RTT estimate; see {{Section 13.2 of QUIC-TRANSPORT}}. 251 | 252 | ## Probe Timeout Replaces RTO and TLP 253 | 254 | QUIC uses a probe timeout (PTO; see {{pto}}), with a timer based on TCP's 255 | retransmission timeout (RTO) computation; see {{?RFC6298}}. QUIC's PTO includes 256 | the peer's maximum expected acknowledgment delay instead of using a fixed 257 | minimum timeout. 258 | 259 | Similar to the RACK-TLP loss detection algorithm for TCP {{?RFC8985}}, QUIC does 260 | not collapse the congestion window when the PTO expires, since a single packet 261 | loss at the tail does not indicate persistent congestion. Instead, QUIC 262 | collapses the congestion window when persistent congestion is declared; see 263 | {{persistent-congestion}}. In doing this, QUIC avoids unnecessary congestion 264 | window reductions, obviating the need for correcting mechanisms such as Forward 265 | RTO-Recovery (F-RTO) {{?RFC5682}}. 
Since QUIC does not collapse the congestion 266 | window on a PTO expiration, a QUIC sender is not limited from sending more 267 | in-flight packets after a PTO expiration if it still has available congestion 268 | window. This occurs when a sender is application limited and the PTO timer 269 | expires. This is more aggressive than TCP's RTO mechanism when application 270 | limited, but identical when not application limited. 271 | 272 | QUIC allows probe packets to temporarily exceed the congestion window whenever 273 | the timer expires. 274 | 275 | ## The Minimum Congestion Window Is Two Packets 276 | 277 | TCP uses a minimum congestion window of one packet. However, loss of that single 278 | packet means that the sender needs to wait for a PTO to recover ({{pto}}), which 279 | can be much longer than an RTT. Sending a single ack-eliciting packet also 280 | increases the chances of incurring additional latency when a receiver delays its 281 | acknowledgment. 282 | 283 | QUIC therefore recommends that the minimum congestion window be two 284 | packets. While this increases network load, it is considered safe since the 285 | sender will still reduce its sending rate exponentially under persistent 286 | congestion ({{pto}}). 287 | 288 | ## Handshake Packets Are Not Special 289 | 290 | TCP treats the loss of a SYN or SYN-ACK packet as persistent congestion and 291 | reduces the congestion window to one packet; see {{?RFC5681}}. QUIC treats loss 292 | of a packet containing handshake data the same as other losses. 293 | 294 | # Estimating the Round-Trip Time {#compute-rtt} 295 | 296 | At a high level, an endpoint measures the time from when a packet was sent to 297 | when it is acknowledged as an RTT sample. The endpoint uses RTT samples and 298 | peer-reported host delays (see {{Section 13.2 of QUIC-TRANSPORT}}) to generate a 299 | statistical description of the network path's RTT.
An endpoint computes the 300 | following three values for each path: the minimum value over a period of time 301 | (min_rtt), an exponentially weighted moving average (smoothed_rtt), and the mean 302 | deviation (referred to as "variation" in the rest of this document) in the 303 | observed RTT samples (rttvar). 304 | 305 | ## Generating RTT Samples {#latest-rtt} 306 | 307 | An endpoint generates an RTT sample on receiving an ACK frame that meets the 308 | following two conditions: 309 | 310 | - the largest acknowledged packet number is newly acknowledged, and 311 | 312 | - at least one of the newly acknowledged packets was ack-eliciting. 313 | 314 | The RTT sample, latest_rtt, is generated as the time elapsed since the largest 315 | acknowledged packet was sent: 316 | 317 | ~~~pseudocode 318 | latest_rtt = ack_time - send_time_of_largest_acked 319 | ~~~ 320 | 321 | An RTT sample is generated using only the largest acknowledged packet in the 322 | received ACK frame. This is because a peer reports acknowledgment delays for 323 | only the largest acknowledged packet in an ACK frame. While the reported 324 | acknowledgment delay is not used by the RTT sample measurement, it is used to 325 | adjust the RTT sample in subsequent computations of smoothed_rtt and rttvar 326 | ({{smoothed-rtt}}). 327 | 328 | To avoid generating multiple RTT samples for a single packet, an ACK frame 329 | SHOULD NOT be used to update RTT estimates if it does not newly acknowledge the 330 | largest acknowledged packet. 331 | 332 | An RTT sample MUST NOT be generated on receiving an ACK frame that does not 333 | newly acknowledge at least one ack-eliciting packet. A peer usually does not 334 | send an ACK frame when only non-ack-eliciting packets are received. Therefore, 335 | an ACK frame that contains acknowledgments for only non-ack-eliciting packets 336 | could include an arbitrarily large ACK Delay value. 
Ignoring 337 | such ACK frames avoids complications in subsequent smoothed_rtt and rttvar 338 | computations. 339 | 340 | A sender might generate multiple RTT samples per RTT when multiple ACK frames 341 | are received within an RTT. As suggested in {{?RFC6298}}, doing so might result 342 | in inadequate history in smoothed_rtt and rttvar. Ensuring that RTT estimates 343 | retain sufficient history is an open research question. 344 | 345 | ## Estimating min_rtt {#min-rtt} 346 | 347 | min_rtt is the sender's estimate of the minimum RTT observed for a given network 348 | path over a period of time. In this document, min_rtt is used by loss detection 349 | to reject implausibly small RTT samples. 350 | 351 | min_rtt MUST be set to the latest_rtt on the first RTT sample. min_rtt MUST be 352 | set to the lesser of min_rtt and latest_rtt ({{latest-rtt}}) on all other 353 | samples. 354 | 355 | An endpoint uses only locally observed times in computing the min_rtt and does 356 | not adjust for acknowledgment delays reported by the peer. Doing so allows the 357 | endpoint to set a lower bound for the smoothed_rtt based entirely on what it 358 | observes (see {{smoothed-rtt}}) and limits potential underestimation due to 359 | erroneously reported delays by the peer. 360 | 361 | The RTT for a network path may change over time. If a path's actual RTT 362 | decreases, the min_rtt will adapt immediately on the first low sample. If the 363 | path's actual RTT increases, however, the min_rtt will not adapt to it, allowing 364 | future RTT samples that are smaller than the new RTT to be included in 365 | smoothed_rtt. 366 | 367 | Endpoints SHOULD set the min_rtt to the newest RTT sample after persistent 368 | congestion is established. This avoids repeatedly declaring persistent 369 | congestion when the RTT increases. This also allows a connection to reset 370 | its estimate of min_rtt and smoothed_rtt after a disruptive network event; 371 | see {{smoothed-rtt}}. 
372 | 373 | Endpoints MAY reestablish the min_rtt at other times in the connection, such as 374 | when traffic volume is low and an acknowledgment is received with a low 375 | acknowledgment delay. Implementations SHOULD NOT refresh the min_rtt 376 | value too often since the actual minimum RTT of the path is not 377 | frequently observable. 378 | 379 | 380 | ## Estimating smoothed_rtt and rttvar {#smoothed-rtt} 381 | 382 | smoothed_rtt is an exponentially weighted moving average of an endpoint's RTT 383 | samples, and rttvar estimates the variation in the RTT samples using a mean 384 | variation. 385 | 386 | The calculation of smoothed_rtt uses RTT samples after adjusting them for 387 | acknowledgment delays. These delays are decoded from the ACK Delay field of 388 | ACK frames as described in {{Section 19.3 of QUIC-TRANSPORT}}. 389 | 390 | The peer might report acknowledgment delays that are larger than the peer's 391 | max_ack_delay during the handshake ({{Section 13.2.1 of QUIC-TRANSPORT}}). To 392 | account for this, the endpoint SHOULD ignore max_ack_delay until the handshake 393 | is confirmed, as defined in {{Section 4.1.2 of QUIC-TLS}}. When they occur, 394 | these large acknowledgment delays are likely to be non-repeating and limited to 395 | the handshake. The endpoint can therefore use them without limiting them to the 396 | max_ack_delay, avoiding unnecessary inflation of the RTT estimate. 397 | 398 | Note that a large acknowledgment delay can result in a substantially inflated 399 | smoothed_rtt if there is an error either in the peer's reporting of the 400 | acknowledgment delay or in the endpoint's min_rtt estimate. Therefore, prior 401 | to handshake confirmation, an endpoint MAY ignore RTT samples if adjusting 402 | the RTT sample for acknowledgment delay causes the sample to be less than the 403 | min_rtt. 
404 | 405 | After the handshake is confirmed, any acknowledgment delays reported by the 406 | peer that are greater than the peer's max_ack_delay are attributed to 407 | unintentional but potentially repeating delays, such as scheduler latency at the 408 | peer or loss of previous acknowledgments. Excess delays could also be due to 409 | a noncompliant receiver. Therefore, these extra delays are considered 410 | effectively part of path delay and incorporated into the RTT estimate. 411 | 412 | Therefore, when adjusting an RTT sample using peer-reported acknowledgment 413 | delays, an endpoint: 414 | 415 | - MAY ignore the acknowledgment delay for Initial packets, since these 416 | acknowledgments are not delayed by the peer ({{Section 13.2.1 of 417 | QUIC-TRANSPORT}}); 418 | 419 | - SHOULD ignore the peer's max_ack_delay until the handshake is confirmed; 420 | 421 | - MUST use the lesser of the acknowledgment delay and the peer's max_ack_delay 422 | after the handshake is confirmed; and 423 | 424 | - MUST NOT subtract the acknowledgment delay from the RTT sample if the 425 | resulting value is smaller than the min_rtt. This limits the underestimation 426 | of the smoothed_rtt due to a misreporting peer. 427 | 428 | Additionally, an endpoint might postpone the processing of acknowledgments when 429 | the corresponding decryption keys are not immediately available. For example, a 430 | client might receive an acknowledgment for a 0-RTT packet that it cannot 431 | decrypt because 1-RTT packet protection keys are not yet available to it. In 432 | such cases, an endpoint SHOULD subtract such local delays from its RTT sample 433 | until the handshake is confirmed. 434 | 435 | Similar to {{?RFC6298}}, smoothed_rtt and rttvar are computed as follows. 436 | 437 | An endpoint initializes the RTT estimator during connection establishment and 438 | when the estimator is reset during connection migration; see {{Section 9.4 of 439 | QUIC-TRANSPORT}}. 
Before any RTT samples are available for a new path or when 440 | the estimator is reset, the estimator is initialized using the initial RTT; see 441 | {{pto-handshake}}. 442 | 443 | smoothed_rtt and rttvar are initialized as follows, where kInitialRtt contains 444 | the initial RTT value: 445 | 446 | ~~~pseudocode 447 | smoothed_rtt = kInitialRtt 448 | rttvar = kInitialRtt / 2 449 | ~~~ 450 | 451 | RTT samples for the network path are recorded in latest_rtt; see 452 | {{latest-rtt}}. On the first RTT sample after initialization, the estimator is 453 | reset using that sample. This ensures that the estimator retains no history of 454 | past samples. Packets sent on other paths do not contribute RTT samples to the 455 | current path, as described in {{Section 9.4 of QUIC-TRANSPORT}}. 456 | 457 | On the first RTT sample after initialization, smoothed_rtt and rttvar are set as 458 | follows: 459 | 460 | ~~~pseudocode 461 | smoothed_rtt = latest_rtt 462 | rttvar = latest_rtt / 2 463 | ~~~ 464 | 465 | On subsequent RTT samples, smoothed_rtt and rttvar evolve as follows: 466 | 467 | ~~~pseudocode 468 | ack_delay = decoded acknowledgment delay from ACK frame 469 | if (handshake confirmed): 470 | ack_delay = min(ack_delay, max_ack_delay) 471 | adjusted_rtt = latest_rtt 472 | if (latest_rtt >= min_rtt + ack_delay): 473 | adjusted_rtt = latest_rtt - ack_delay 474 | rttvar_sample = abs(smoothed_rtt - adjusted_rtt) 475 | rttvar = 3/4 * rttvar + 1/4 * rttvar_sample 476 | smoothed_rtt = 7/8 * smoothed_rtt + 1/8 * adjusted_rtt 477 | ~~~ 478 | 479 | # Loss Detection {#loss-detection} 480 | 481 | QUIC senders use acknowledgments to detect lost packets and a PTO to ensure 482 | acknowledgments are received; see {{pto}}. This section provides a description 483 | of these algorithms. 484 | 485 | If a packet is lost, the QUIC transport needs to recover from that loss, such 486 | as by retransmitting the data, sending an updated frame, or discarding the 487 | frame.
For more information, see {{Section 13.3 of QUIC-TRANSPORT}}. 488 | 489 | Loss detection is separate per packet number space, unlike RTT measurement and 490 | congestion control, because RTT and congestion control are properties of the 491 | path, whereas loss detection also relies upon key availability. 492 | 493 | ## Acknowledgment-Based Detection {#ack-loss-detection} 494 | 495 | Acknowledgment-based loss detection implements the spirit of TCP's Fast 496 | Retransmit {{?RFC5681}}, Early Retransmit {{?RFC5827}}, Forward Acknowledgment 497 | {{FACK}}, SACK loss recovery {{?RFC6675}}, and RACK-TLP {{?RFC8985}}. This 498 | section provides an overview of how these algorithms are implemented in QUIC. 499 | 500 | A packet is declared lost if it meets all of the following conditions: 501 | 502 | * The packet is unacknowledged, in flight, and was sent prior to an 503 | acknowledged packet. 504 | 505 | * The packet was sent kPacketThreshold packets before an acknowledged packet 506 | ({{packet-threshold}}), or it was sent long enough in the past 507 | ({{time-threshold}}). 508 | 509 | The acknowledgment indicates that a packet sent later was delivered, and the 510 | packet and time thresholds provide some tolerance for packet reordering. 511 | 512 | Spuriously declaring packets as lost leads to unnecessary retransmissions and 513 | may result in degraded performance due to the actions of the congestion 514 | controller upon detecting loss. Implementations can detect spurious 515 | retransmissions and increase the packet or time reordering threshold to 516 | reduce future spurious retransmissions and loss events. Implementations with 517 | adaptive time thresholds MAY choose to start with smaller initial reordering 518 | thresholds to minimize recovery latency. 
519 | 520 | ### Packet Threshold {#packet-threshold} 521 | 522 | The RECOMMENDED initial value for the packet reordering threshold 523 | (kPacketThreshold) is 3, based on best practices for TCP loss detection 524 | {{?RFC5681}} {{?RFC6675}}. In order to remain similar to TCP, 525 | implementations SHOULD NOT use a packet threshold less than 3; see {{?RFC5681}}. 526 | 527 | Some networks may exhibit higher degrees of packet reordering, causing a sender 528 | to detect spurious losses. Additionally, packet reordering could be more common 529 | with QUIC than TCP because network elements that could observe and reorder TCP 530 | packets cannot do that for QUIC and also because QUIC packet numbers are 531 | encrypted. Algorithms that increase the reordering threshold after spuriously 532 | detecting losses, such as RACK {{?RFC8985}}, have proven to be useful in TCP and 533 | are expected to be at least as useful in QUIC. 534 | 535 | ### Time Threshold {#time-threshold} 536 | 537 | Once a later packet within the same packet number space has been acknowledged, 538 | an endpoint SHOULD declare an earlier packet lost if it was sent a threshold 539 | amount of time in the past. To avoid declaring packets as lost too early, this 540 | time threshold MUST be set to at least the local timer granularity, as 541 | indicated by the kGranularity constant. The time threshold is: 542 | 543 | ~~~pseudocode 544 | max(kTimeThreshold * max(smoothed_rtt, latest_rtt), kGranularity) 545 | ~~~ 546 | 547 | If packets sent prior to the largest acknowledged packet cannot yet be declared 548 | lost, then a timer SHOULD be set for the remaining time. 
549 | 550 | Using max(smoothed_rtt, latest_rtt) protects against the following two cases: 551 | 552 | * the latest RTT sample is lower than the smoothed RTT, perhaps due to 553 | reordering where the acknowledgment encountered a shorter path; 554 | 555 | * the latest RTT sample is higher than the smoothed RTT, perhaps due to a 556 | sustained increase in the actual RTT, but the smoothed RTT has not yet caught 557 | up. 558 | 559 | The RECOMMENDED time threshold (kTimeThreshold), expressed as an RTT multiplier, 560 | is 9/8. The RECOMMENDED value of the timer granularity (kGranularity) is 1 561 | millisecond. 562 | 563 | 567 | 568 | Implementations MAY experiment with absolute thresholds, thresholds from 569 | previous connections, adaptive thresholds, or the inclusion of RTT variation. 570 | Smaller thresholds reduce reordering resilience and increase spurious 571 | retransmissions, and larger thresholds increase loss detection delay. 572 | 573 | 574 | ## Probe Timeout {#pto} 575 | 576 | A Probe Timeout (PTO) triggers the sending of one or two probe datagrams when 577 | ack-eliciting packets are not acknowledged within the expected period of 578 | time or when the server may not have validated the client's address. A PTO enables 579 | a connection to recover from loss of tail packets or acknowledgments. 580 | 581 | As with loss detection, the PTO is per packet number space. That is, a 582 | PTO value is computed per packet number space. 583 | 584 | A PTO timer expiration event does not indicate packet loss and MUST NOT cause 585 | prior unacknowledged packets to be marked as lost. When an acknowledgment is 586 | received that newly acknowledges packets, loss detection proceeds as dictated 587 | by the packet and time threshold mechanisms; see {{ack-loss-detection}}. 588 | 589 | The PTO algorithm used in QUIC implements the reliability functions of Tail Loss 590 | Probe {{?RFC8985}}, RTO {{?RFC5681}}, and F-RTO algorithms for TCP 591 | {{?RFC5682}}.
The timeout computation is based on TCP's RTO period {{?RFC6298}}. 592 | 593 | ### Computing PTO 594 | 595 | When an ack-eliciting packet is transmitted, the sender schedules a timer for 596 | the PTO period as follows: 597 | 598 | ~~~pseudocode 599 | PTO = smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay 600 | ~~~ 601 | 602 | The PTO period is the amount of time that a sender ought to wait for an 603 | acknowledgment of a sent packet. This time period includes the estimated 604 | network RTT (smoothed_rtt), the variation in the estimate (4*rttvar), 605 | and max_ack_delay, to account for the maximum time by which a receiver might 606 | delay sending an acknowledgment. 607 | 608 | When the PTO is armed for Initial or Handshake packet number spaces, the 609 | max_ack_delay in the PTO period computation is set to 0, since the peer is 610 | expected to not delay these packets intentionally; see {{Section 13.2.1 of 611 | QUIC-TRANSPORT}}. 612 | 613 | The PTO period MUST be at least kGranularity to avoid the timer expiring 614 | immediately. 615 | 616 | When ack-eliciting packets in multiple packet number spaces are in flight, the 617 | timer MUST be set to the earlier value of the Initial and Handshake packet 618 | number spaces. 619 | 620 | An endpoint MUST NOT set its PTO timer for the Application Data packet number 621 | space until the handshake is confirmed. Doing so prevents the endpoint from 622 | retransmitting information in packets when either the peer does not yet have the 623 | keys to process them or the endpoint does not yet have the keys to process their 624 | acknowledgments. For example, this can happen when a client sends 0-RTT packets 625 | to the server; it does so without knowing whether the server will be able to 626 | decrypt them. Similarly, this can happen when a server sends 1-RTT packets 627 | before confirming that the client has verified the server's certificate and can 628 | therefore read these 1-RTT packets. 
629 | 630 | A sender SHOULD restart its PTO timer every time an ack-eliciting packet is 631 | sent or acknowledged, or when Initial or Handshake keys are discarded 632 | ({{Section 4.9 of QUIC-TLS}}). This ensures the PTO is always set based on the 633 | latest estimate of the RTT and for the correct packet across packet 634 | number spaces. 635 | 636 | When a PTO timer expires, the PTO backoff MUST be increased, resulting in the 637 | PTO period being set to twice its current value. The PTO backoff factor is reset 638 | when an acknowledgment is received, except in the following case. A server 639 | might take longer to respond to packets during the handshake than otherwise. To 640 | protect such a server from repeated client probes, the PTO backoff is not reset 641 | at a client that is not yet certain that the server has finished validating the 642 | client's address. That is, a client does not reset the PTO backoff factor on 643 | receiving acknowledgments in Initial packets. 644 | 645 | This exponential reduction in the sender's rate is important because consecutive 646 | PTOs might be caused by loss of packets or acknowledgments due to severe 647 | congestion. Even when there are ack-eliciting packets in flight in multiple 648 | packet number spaces, the exponential increase in PTO occurs across all spaces 649 | to prevent excess load on the network. For example, a timeout in the Initial 650 | packet number space doubles the length of the timeout in the Handshake packet 651 | number space. 652 | 653 | The total length of time over which consecutive PTOs expire is limited by the 654 | idle timeout. 655 | 656 | The PTO timer MUST NOT be set if a timer is set for time threshold 657 | loss detection; see {{time-threshold}}. A timer that is set for time 658 | threshold loss detection will expire earlier than the PTO timer 659 | in most cases and is less likely to spuriously retransmit data. 
660 | 661 | ### Handshakes and New Paths {#pto-handshake} 662 | 663 | Resumed connections over the same network MAY use the previous connection's 664 | final smoothed RTT value as the resumed connection's initial RTT. When no 665 | previous RTT is available, the initial RTT SHOULD be set to 333 milliseconds. 666 | This results in handshakes starting with a PTO of 1 second, as recommended for 667 | TCP's initial RTO; see {{Section 2 of RFC6298}}. 668 | 669 | A connection MAY use the delay between sending a PATH_CHALLENGE and receiving a 670 | PATH_RESPONSE to set the initial RTT (see kInitialRtt in 671 | {{constants-of-interest}}) for a new path, but the delay SHOULD NOT be 672 | considered an RTT sample. 673 | 674 | When the Initial keys and Handshake keys are discarded (see 675 | {{discarding-packets}}), any Initial packets and Handshake packets can 676 | no longer be acknowledged, so they are removed from bytes in 677 | flight. When Initial or Handshake keys are discarded, the PTO and loss 678 | detection timers MUST be reset, because discarding keys indicates 679 | forward progress and the loss detection timer might have been set for 680 | a now-discarded packet number space. 681 | 682 | #### Before Address Validation 683 | 684 | Until the server has validated the client's address on the path, the amount of 685 | data it can send is limited to three times the amount of data received, 686 | as specified in {{Section 8.1 of QUIC-TRANSPORT}}. If no additional data can be 687 | sent, the server's PTO timer MUST NOT be armed until datagrams have been 688 | received from the client because packets sent on PTO count against the 689 | anti-amplification limit. 690 | 691 | When the server receives a datagram from the client, the amplification limit is 692 | increased and the server resets the PTO timer. If the PTO timer is then set to 693 | a time in the past, it is executed immediately. 
Doing so avoids sending new 694 | 1-RTT packets prior to packets critical to the completion of the handshake. 695 | In particular, this can happen when 0-RTT is accepted but the server fails to 696 | validate the client's address. 697 | 698 | Since the server could be blocked until more datagrams are received from the 699 | client, it is the client's responsibility to send packets to unblock the server 700 | until it is certain that the server has finished its address validation (see 701 | {{Section 8 of QUIC-TRANSPORT}}). That is, the client MUST set the PTO timer 702 | if the client has not received an acknowledgment for any of its Handshake 703 | packets and the handshake is not confirmed (see {{Section 4.1.2 of QUIC-TLS}}), 704 | even if there are no packets in flight. When the PTO fires, the client MUST 705 | send a Handshake packet if it has Handshake keys; otherwise, it MUST send an 706 | Initial packet in a UDP datagram with a payload of at least 1200 bytes. 707 | 708 | ### Speeding up Handshake Completion 709 | 710 | When a server receives an Initial packet containing duplicate CRYPTO data, 711 | it can assume the client did not receive all of the server's CRYPTO data sent 712 | in Initial packets, or the client's estimated RTT is too small. When a 713 | client receives Handshake or 1-RTT packets prior to obtaining Handshake keys, 714 | it may assume some or all of the server's Initial packets were lost. 715 | 716 | To speed up handshake completion under these conditions, an endpoint MAY, for a 717 | limited number of times per connection, send a packet containing 718 | unacknowledged CRYPTO data earlier than the PTO expiry, subject to the address 719 | validation limits in {{Section 8.1 of QUIC-TRANSPORT}}. Doing so at most once 720 | for each connection is adequate to quickly recover from a single packet loss.
721 | An endpoint that always retransmits packets in response to receiving packets 722 | that it cannot process risks creating an infinite exchange of packets. 723 | 724 | Endpoints can also use coalesced packets (see {{Section 12.2 of 725 | QUIC-TRANSPORT}}) to ensure that each datagram elicits at least one 726 | acknowledgment. For example, a client can coalesce an Initial packet containing 727 | PING and PADDING frames with a 0-RTT data packet, and a server can coalesce an 728 | Initial packet containing a PING frame with one or more packets in its first 729 | flight. 730 | 731 | ### Sending Probe Packets 732 | 733 | When a PTO timer expires, a sender MUST send at least one ack-eliciting packet 734 | in the packet number space as a probe. An endpoint MAY send up to two 735 | full-sized datagrams containing ack-eliciting packets to avoid an expensive 736 | consecutive PTO expiration due to a single lost datagram or to transmit data 737 | from multiple packet number spaces. All probe packets sent on a PTO MUST be 738 | ack-eliciting. 739 | 740 | In addition to sending data in the packet number space for which the timer 741 | expired, the sender SHOULD send ack-eliciting packets from other packet number 742 | spaces with in-flight data, coalescing packets if possible. This is 743 | particularly valuable when the server has both Initial and Handshake data in 744 | flight or when the client has both Handshake and Application Data in flight 745 | because the peer might only have receive keys for one of the two packet number 746 | spaces. 747 | 748 | If the sender wants to elicit a faster acknowledgment on PTO, it can skip a 749 | packet number to eliminate the acknowledgment delay. 750 | 751 | An endpoint SHOULD include new data in packets that are sent on PTO expiration. 752 | Previously sent data MAY be sent if no new data can be sent. 
Implementations 753 | MAY use alternative strategies for determining the content of probe packets, 754 | including sending new or retransmitted data based on the application's 755 | priorities. 756 | 757 | It is possible the sender has no new or previously sent data to send. 758 | As an example, consider the following sequence of events: new application data 759 | is sent in a STREAM frame, deemed lost, then retransmitted in a new packet, 760 | and then the original transmission is acknowledged. When there is no data to 761 | send, the sender SHOULD send a PING or other ack-eliciting frame in a single 762 | packet, rearming the PTO timer. 763 | 764 | Alternatively, instead of sending an ack-eliciting packet, the sender MAY mark 765 | any packets still in flight as lost. Doing so avoids sending an additional 766 | packet but increases the risk that loss is declared too aggressively, resulting 767 | in an unnecessary rate reduction by the congestion controller. 768 | 769 | Consecutive PTO periods increase exponentially, and as a result, connection 770 | recovery latency increases exponentially as packets continue to be dropped in 771 | the network. Sending two packets on PTO expiration increases resilience to 772 | packet drops, thus reducing the probability of consecutive PTO events. 773 | 774 | When the PTO timer expires multiple times and new data cannot be sent, 775 | implementations must choose between sending the same payload every time 776 | or sending different payloads. Sending the same payload may be simpler 777 | and ensures the highest priority frames arrive first. Sending different 778 | payloads each time reduces the chances of spurious retransmission. 779 | 780 | 781 | ## Handling Retry Packets 782 | 783 | A Retry packet causes a client to send another Initial packet, effectively 784 | restarting the connection process. A Retry packet indicates that the Initial 785 | packet was received but not processed. 
A Retry packet cannot be treated as an 786 | acknowledgment because it does not indicate that a packet was processed or 787 | specify the packet number. 788 | 789 | Clients that receive a Retry packet reset congestion control and loss recovery 790 | state, including resetting any pending timers. Other connection state, in 791 | particular cryptographic handshake messages, is retained; see 792 | {{Section 17.2.5 of QUIC-TRANSPORT}}. 793 | 794 | The client MAY compute an RTT estimate to the server as the time period from 795 | when the first Initial packet was sent to when a Retry or a Version Negotiation 796 | packet is received. The client MAY use this value in place of its default for 797 | the initial RTT estimate. 798 | 799 | ## Discarding Keys and Packet State {#discarding-packets} 800 | 801 | When Initial and Handshake packet protection keys are discarded 802 | (see {{Section 4.9 of QUIC-TLS}}), all packets that were sent with those keys 803 | can no longer be acknowledged because their acknowledgments cannot be processed. 804 | The sender MUST discard all recovery state associated with those packets 805 | and MUST remove them from the count of bytes in flight. 806 | 807 | Endpoints stop sending and receiving Initial packets once they start exchanging 808 | Handshake packets; see {{Section 17.2.2.1 of QUIC-TRANSPORT}}. At this point, 809 | recovery state for all in-flight Initial packets is discarded. 810 | 811 | When 0-RTT is rejected, recovery state for all in-flight 0-RTT packets is 812 | discarded. 813 | 814 | If a server accepts 0-RTT, but does not buffer 0-RTT packets that arrive 815 | before Initial packets, early 0-RTT packets will be declared lost, but that 816 | is expected to be infrequent. 817 | 818 | It is expected that keys are discarded at some time after the packets 819 | encrypted with them are either acknowledged or declared lost. 
However, 820 | Initial and Handshake secrets are discarded as soon as Handshake and 821 | 1-RTT keys are proven to be available to both client and server; see 822 | {{Section 4.9.1 of QUIC-TLS}}. 823 | 824 | # Congestion Control {#congestion-control} 825 | 826 | This document specifies a sender-side congestion controller for QUIC similar to 827 | TCP NewReno {{?RFC6582}}. 828 | 829 | The signals QUIC provides for congestion control are generic and are designed to 830 | support different sender-side algorithms. A sender can unilaterally choose a 831 | different algorithm to use, such as CUBIC {{?RFC8312}}. 832 | 833 | If a sender uses a different controller than that specified in this document, 834 | the chosen controller MUST conform to the congestion control guidelines 835 | specified in {{Section 3.1 of RFC8085}}. 836 | 837 | Similar to TCP, packets containing only ACK frames do not count toward bytes 838 | in flight and are not congestion controlled. Unlike TCP, QUIC can detect the 839 | loss of these packets and MAY use that information to adjust the congestion 840 | controller or the rate of ACK-only packets being sent, but this document does 841 | not describe a mechanism for doing so. 842 | 843 | The congestion controller is per path, so packets sent on other paths do not 844 | alter the current path's congestion controller, as described in 845 | {{Section 9.4 of QUIC-TRANSPORT}}. 846 | 847 | The algorithm in this document specifies and uses the controller's congestion 848 | window in bytes. 849 | 850 | An endpoint MUST NOT send a packet if it would cause bytes_in_flight (see 851 | {{vars-of-interest}}) to be larger than the congestion window, unless the packet 852 | is sent on a PTO timer expiration (see {{pto}}) or when entering recovery 853 | (see {{recovery-period}}). 
854 | 855 | ## Explicit Congestion Notification {#congestion-ecn} 856 | 857 | If a path has been validated to support Explicit Congestion Notification (ECN) 858 | {{!RFC3168}} {{?RFC8311}}, QUIC treats a Congestion Experienced (CE) codepoint 859 | in the IP header as a signal of congestion. This document specifies an 860 | endpoint's response when the peer-reported ECN-CE count increases; see {{Section 861 | 13.4.2 of QUIC-TRANSPORT}}. 862 | 863 | ## Initial and Minimum Congestion Window {#initial-cwnd} 864 | 865 | QUIC begins every connection in slow start with the congestion window set to an 866 | initial value. Endpoints SHOULD use an initial congestion window of ten times 867 | the maximum datagram size (max_datagram_size), while limiting the window to the 868 | larger of 14,720 bytes or twice the maximum datagram size. This follows the 869 | analysis and recommendations in {{?RFC6928}}, increasing the byte limit to 870 | account for the smaller 8-byte overhead of UDP compared to the 20-byte overhead 871 | for TCP. 872 | 873 | If the maximum datagram size changes during the connection, the initial 874 | congestion window SHOULD be recalculated with the new size. If the maximum 875 | datagram size is decreased in order to complete the handshake, the 876 | congestion window SHOULD be set to the new initial congestion window. 877 | 878 | Prior to validating the client's address, the server can be further limited by 879 | the anti-amplification limit as specified in {{Section 8.1 of QUIC-TRANSPORT}}. 880 | Though the anti-amplification limit can prevent the congestion window from 881 | being fully utilized and therefore slow down the increase in congestion window, 882 | it does not directly affect the congestion window. 883 | 884 | The minimum congestion window is the smallest value the congestion window can 885 | attain in response to loss, an increase in the peer-reported ECN-CE count, 886 | or persistent congestion. 
The RECOMMENDED value is 2 * max_datagram_size. 887 | 888 | ## Congestion Control States 889 | 890 | The NewReno congestion controller described in this document has three 891 | distinct states, as shown in {{fig-cc-fsm}}. 892 | 893 | ~~~ 894 | New path or +------------+ 895 | persistent congestion | Slow | 896 | (O)---------------------->| Start | 897 | +------------+ 898 | | 899 | Loss or | 900 | ECN-CE increase | 901 | v 902 | +------------+ Loss or +------------+ 903 | | Congestion | ECN-CE increase | Recovery | 904 | | Avoidance |------------------>| Period | 905 | +------------+ +------------+ 906 | ^ | 907 | | | 908 | +----------------------------+ 909 | Acknowledgment of packet 910 | sent during recovery 911 | ~~~ 912 | {: #fig-cc-fsm title="Congestion Control States and Transitions"} 913 | 914 | These states and the transitions between them are described in subsequent 915 | sections. 916 | 917 | ### Slow Start 918 | 919 | A NewReno sender is in slow start any time the congestion window is below the 920 | slow start threshold. A sender begins in slow start because the slow start 921 | threshold is initialized to an infinite value. 922 | 923 | While a sender is in slow start, the congestion window increases by the number 924 | of bytes acknowledged when each acknowledgment is processed. This results in 925 | exponential growth of the congestion window. 926 | 927 | The sender MUST exit slow start and enter a recovery period when a packet is 928 | lost or when the ECN-CE count reported by its peer increases. 929 | 930 | A sender reenters slow start any time the congestion window is less than the 931 | slow start threshold, which only occurs after persistent congestion is 932 | declared. 933 | 934 | ### Recovery {#recovery-period} 935 | 936 | A NewReno sender enters a recovery period when it detects the loss of a packet 937 | or when the ECN-CE count reported by its peer increases. 
A sender that is 938 | already in a recovery period stays in it and does not reenter it. 939 | 940 | On entering a recovery period, a sender MUST set the slow start threshold to 941 | half the value of the congestion window when loss is detected. The congestion 942 | window MUST be set to the reduced value of the slow start threshold before 943 | exiting the recovery period. 944 | 945 | Implementations MAY reduce the congestion window immediately upon entering a 946 | recovery period or use other mechanisms, such as Proportional Rate Reduction 947 | {{?PRR=RFC6937}}, to reduce the congestion window more gradually. If the 948 | congestion window is reduced immediately, a single packet can be sent prior to 949 | reduction. This speeds up loss recovery if the data in the lost packet is 950 | retransmitted and is similar to TCP as described in {{Section 5 of RFC6675}}. 951 | 952 | The recovery period aims to limit congestion window reduction to once per round 953 | trip. Therefore, during a recovery period, the congestion window does not change 954 | in response to new losses or increases in the ECN-CE count. 955 | 956 | A recovery period ends and the sender enters congestion avoidance when a packet 957 | sent during the recovery period is acknowledged. This is slightly different 958 | from TCP's definition of recovery, which ends when the lost segment that 959 | started recovery is acknowledged {{?RFC5681}}. 960 | 961 | ### Congestion Avoidance 962 | 963 | A NewReno sender is in congestion avoidance any time the congestion window is 964 | at or above the slow start threshold and not in a recovery period. 965 | 966 | A sender in congestion avoidance uses an Additive Increase Multiplicative 967 | Decrease (AIMD) approach that MUST limit the increase to the congestion window 968 | to at most one maximum datagram size for each congestion window that is 969 | acknowledged. 
970 | 971 | The sender exits congestion avoidance and enters a recovery period when a 972 | packet is lost or when the ECN-CE count reported by its peer increases. 973 | 974 | ## Ignoring Loss of Undecryptable Packets 975 | 976 | During the handshake, some packet protection keys might not be available when 977 | a packet arrives, and the receiver can choose to drop the packet. In particular, 978 | Handshake and 0-RTT packets cannot be processed until the Initial packets 979 | arrive, and 1-RTT packets cannot be processed until the handshake completes. 980 | Endpoints MAY ignore the loss of Handshake, 0-RTT, and 1-RTT packets that might 981 | have arrived before the peer had packet protection keys to process those 982 | packets. Endpoints MUST NOT ignore the loss of packets that were sent after 983 | the earliest acknowledged packet in a given packet number space. 984 | 985 | ## Probe Timeout 986 | 987 | Probe packets MUST NOT be blocked by the congestion controller. A sender MUST 988 | however count these packets as being additionally in flight, since these packets 989 | add network load without establishing packet loss. Note that sending probe 990 | packets might cause the sender's bytes in flight to exceed the congestion window 991 | until an acknowledgment is received that establishes loss or delivery of 992 | packets. 993 | 994 | ## Persistent Congestion {#persistent-congestion} 995 | 996 | When a sender establishes loss of all packets sent over a long enough duration, 997 | the network is considered to be experiencing persistent congestion. 
998 | 999 | ### Duration {#pc-duration} 1000 | 1001 | The persistent congestion duration is computed as follows: 1002 | 1003 | ~~~pseudocode 1004 | (smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay) * 1005 | kPersistentCongestionThreshold 1006 | ~~~ 1007 | 1008 | Unlike the PTO computation in {{pto}}, this duration includes the max_ack_delay 1009 | irrespective of the packet number spaces in which losses are established. 1010 | 1011 | This duration allows a sender to send as many packets before establishing 1012 | persistent congestion, including some in response to PTO expiration, as TCP does 1013 | with Tail Loss Probes {{?RFC8985}} and an RTO {{?RFC5681}}. 1014 | 1015 | Larger values of kPersistentCongestionThreshold cause the sender to become less 1016 | responsive to persistent congestion in the network, which can result in 1017 | aggressive sending into a congested network. Too small a value can result in a 1018 | sender declaring persistent congestion unnecessarily, resulting in reduced 1019 | throughput for the sender. 1020 | 1021 | The RECOMMENDED value for kPersistentCongestionThreshold is 3, which results in 1022 | behavior that is approximately equivalent to a TCP sender declaring an RTO after 1023 | two TLPs. 1024 | 1025 | This design does not use consecutive PTO events to establish persistent 1026 | congestion, since application patterns impact PTO expiration. For example, a 1027 | sender that sends small amounts of data with silence periods between them 1028 | restarts the PTO timer every time it sends, potentially preventing the PTO timer 1029 | from expiring for a long period of time, even when no acknowledgments are being 1030 | received. The use of a duration enables a sender to establish persistent 1031 | congestion without depending on PTO expiration. 
1032 | 1033 | ### Establishing Persistent Congestion 1034 | 1035 | A sender establishes persistent congestion after the receipt of an 1036 | acknowledgment if two packets that are ack-eliciting are declared lost, and: 1037 | 1038 | * across all packet number spaces, none of the packets sent between the send 1039 | times of these two packets are acknowledged; 1040 | 1041 | * the duration between the send times of these two packets exceeds the 1042 | persistent congestion duration ({{pc-duration}}); and 1043 | 1044 | * a prior RTT sample existed when these two packets were sent. 1045 | 1046 | These two packets MUST be ack-eliciting, since a receiver is required to 1047 | acknowledge only ack-eliciting packets within its maximum acknowledgment delay; 1048 | see {{Section 13.2 of QUIC-TRANSPORT}}. 1049 | 1050 | The persistent congestion period SHOULD NOT start until there is at least one 1051 | RTT sample. Before the first RTT sample, a sender arms its PTO timer based on 1052 | the initial RTT ({{pto-handshake}}), which could be substantially larger than 1053 | the actual RTT. Requiring a prior RTT sample prevents a sender from establishing 1054 | persistent congestion with potentially too few probes. 1055 | 1056 | Since network congestion is not affected by packet number spaces, persistent 1057 | congestion SHOULD consider packets sent across packet number spaces. A sender 1058 | that does not have state for all packet number spaces or an implementation that 1059 | cannot compare send times across packet number spaces MAY use state for just the 1060 | packet number space that was acknowledged. This might result in erroneously 1061 | declaring persistent congestion, but it will not lead to a failure to detect 1062 | persistent congestion. 1063 | 1064 | When persistent congestion is declared, the sender's congestion window MUST be 1065 | reduced to the minimum congestion window (kMinimumWindow), similar to a TCP 1066 | sender's response on an RTO {{RFC5681}}. 
1067 | 1068 | ### Example 1069 | 1070 | The following example illustrates how a sender might establish persistent 1071 | congestion. Assume: 1072 | 1073 | ~~~pseudocode 1074 | smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay = 2 1075 | kPersistentCongestionThreshold = 3 1076 | ~~~ 1077 | 1078 | Consider the following sequence of events: 1079 | 1080 | | Time | Action | 1081 | |:-------|:----------------------------------| 1082 | | t=0 | Send packet #1 (application data) | 1083 | | t=1 | Send packet #2 (application data) | 1084 | | t=1.2 | Receive acknowledgment of #1 | 1085 | | t=2 | Send packet #3 (application data) | 1086 | | t=3 | Send packet #4 (application data) | 1087 | | t=4 | Send packet #5 (application data) | 1088 | | t=5 | Send packet #6 (application data) | 1089 | | t=6 | Send packet #7 (application data) | 1090 | | t=8 | Send packet #8 (PTO 1) | 1091 | | t=12 | Send packet #9 (PTO 2) | 1092 | | t=12.2 | Receive acknowledgment of #9 | 1093 | 1094 | Packets 2 through 8 are declared lost when the acknowledgment for packet 9 is 1095 | received at `t = 12.2`. 1096 | 1097 | The congestion period is calculated as the time between the oldest and newest 1098 | lost packets: `8 - 1 = 7`. The persistent congestion duration is `2 * 3 = 6`. 1099 | Because the threshold was reached and because none of the packets between the 1100 | oldest and the newest lost packets were acknowledged, the network is considered 1101 | to have experienced persistent congestion. 1102 | 1103 | While this example shows PTO expirations, they are not required for persistent 1104 | congestion to be established. 1105 | 1106 | 1107 | ## Pacing {#pacing} 1108 | 1109 | A sender SHOULD pace sending of all in-flight packets based on input from the 1110 | congestion controller. 1111 | 1112 | Sending multiple packets into the network without any delay between them creates 1113 | a packet burst that might cause short-term congestion and losses.
Senders MUST 1114 | either use pacing or limit such bursts. Senders SHOULD limit bursts to the 1115 | initial congestion window; see {{initial-cwnd}}. A sender with knowledge that 1116 | the network path to the receiver can absorb larger bursts MAY use a higher 1117 | limit. 1118 | 1119 | An implementation should take care to architect its congestion controller to 1120 | work well with a pacer. For instance, a pacer might wrap the congestion 1121 | controller and control the availability of the congestion window, or a pacer 1122 | might pace out packets handed to it by the congestion controller. 1123 | 1124 | Timely delivery of ACK frames is important for efficient loss recovery. To avoid 1125 | delaying their delivery to the peer, packets containing only ACK frames SHOULD 1126 | therefore not be paced. 1127 | 1128 | Endpoints can implement pacing as they choose. A perfectly paced sender spreads 1129 | packets exactly evenly over time. For a window-based congestion controller, such 1130 | as the one in this document, that rate can be computed by averaging the 1131 | congestion window over the RTT. Expressed as a rate in units of 1132 | bytes per time, where congestion_window is in bytes: 1133 | 1134 | ~~~pseudocode 1135 | rate = N * congestion_window / smoothed_rtt 1136 | ~~~ 1137 | 1138 | Or expressed as an inter-packet interval in units of time: 1139 | 1140 | ~~~pseudocode 1141 | interval = ( smoothed_rtt * packet_size / congestion_window ) / N 1142 | ~~~ 1143 | 1144 | Using a value for `N` that is small, but at least 1 (for example, 1.25) ensures 1145 | that variations in RTT do not result in underutilization of the 1146 | congestion window. 1147 | 1148 | Practical considerations, such as packetization, scheduling delays, and 1149 | computational efficiency, can cause a sender to deviate from this rate over time 1150 | periods that are much shorter than an RTT. 
1151 | 1152 | One possible implementation strategy for pacing uses a leaky bucket algorithm, 1153 | where the capacity of the "bucket" is limited to the maximum burst size and the 1154 | rate the "bucket" fills is determined by the above function. 1155 | 1156 | ## Underutilizing the Congestion Window 1157 | 1158 | When bytes in flight is smaller than the congestion window and sending is not 1159 | pacing limited, the congestion window is underutilized. This can happen due to 1160 | insufficient application data or flow control limits. When this occurs, 1161 | the congestion window SHOULD NOT be increased in either slow start or 1162 | congestion avoidance. 1163 | 1164 | A sender that paces packets (see {{pacing}}) might delay sending packets 1165 | and not fully utilize the congestion window due to this delay. A sender 1166 | SHOULD NOT consider itself application limited if it would have fully 1167 | utilized the congestion window without pacing delay. 1168 | 1169 | A sender MAY implement alternative mechanisms to update its congestion window 1170 | after periods of underutilization, such as those proposed for TCP in 1171 | {{?RFC7661}}. 1172 | 1173 | 1174 | # Security Considerations 1175 | 1176 | ## Loss and Congestion Signals 1177 | 1178 | Loss detection and congestion control fundamentally involve the consumption of 1179 | signals, such as delay, loss, and ECN markings, from unauthenticated 1180 | entities. An attacker can cause endpoints to reduce their sending rate by 1181 | manipulating these signals: by dropping packets, by altering path delay 1182 | strategically, or by changing ECN codepoints. 1183 | 1184 | ## Traffic Analysis 1185 | 1186 | Packets that carry only ACK frames can be heuristically identified by observing 1187 | packet size. Acknowledgment patterns may expose information about link 1188 | characteristics or application behavior. 
To reduce leaked information, 1189 | endpoints can bundle acknowledgments with other frames, or they can use PADDING 1190 | frames at a potential cost to performance. 1191 | 1192 | ## Misreporting ECN Markings 1193 | 1194 | A receiver can misreport ECN markings to alter the congestion response of a 1195 | sender. Suppressing reports of ECN-CE markings could cause a sender to 1196 | increase their send rate. This increase could result in congestion and loss. 1197 | 1198 | A sender can detect suppression of reports by marking occasional packets that it 1199 | sends with an ECN-CE marking. If a packet sent with an ECN-CE marking is not 1200 | reported as having been CE marked when the packet is acknowledged, then the 1201 | sender can disable ECN for that path by not setting ECN-Capable Transport (ECT) 1202 | codepoints in subsequent packets sent on that path {{!RFC3168}}. 1203 | 1204 | Reporting additional ECN-CE markings will cause a sender to reduce their sending 1205 | rate, which is similar in effect to advertising reduced connection flow control 1206 | limits and so no advantage is gained by doing so. 1207 | 1208 | Endpoints choose the congestion controller that they use. Congestion controllers 1209 | respond to reports of ECN-CE by reducing their rate, but the response may vary. 1210 | Markings can be treated as equivalent to loss {{!RFC3168}}, but other 1211 | responses can be specified, such as {{?RFC8511}} or {{?RFC8311}}. 1212 | 1213 | 1214 | --- back 1215 | 1216 | # Loss Recovery Pseudocode 1217 | 1218 | We now describe an example implementation of the loss detection mechanisms 1219 | described in {{loss-detection}}. 1220 | 1221 | The pseudocode segments in this section are licensed as Code Components; see the 1222 | copyright notice. 1223 | 1224 | ## Tracking Sent Packets {#tracking-sent-packets} 1225 | 1226 | To correctly implement congestion control, a QUIC sender tracks every 1227 | ack-eliciting packet until the packet is acknowledged or lost. 
1228 | It is expected that implementations will be able to access this information by 1229 | packet number and crypto context and store the per-packet fields 1230 | ({{sent-packets-fields}}) for loss recovery and congestion control. 1231 | 1232 | After a packet is declared lost, the endpoint can still maintain state for it 1233 | for an amount of time to allow for packet reordering; see {{Section 13.3 of 1234 | QUIC-TRANSPORT}}. This enables a sender to detect spurious retransmissions. 1235 | 1236 | Sent packets are tracked for each packet number space, and ACK 1237 | processing only applies to a single space. 1238 | 1239 | ### Sent Packet Fields {#sent-packets-fields} 1240 | 1241 | packet_number: 1242 | : The packet number of the sent packet. 1243 | 1244 | ack_eliciting: 1245 | : A Boolean that indicates whether a packet is ack-eliciting. 1246 | If true, it is expected that an acknowledgment will be received, 1247 | though the peer could delay sending the ACK frame containing it 1248 | by up to the max_ack_delay. 1249 | 1250 | in_flight: 1251 | : A Boolean that indicates whether the packet counts toward bytes in 1252 | flight. 1253 | 1254 | sent_bytes: 1255 | : The number of bytes sent in the packet, not including UDP or IP 1256 | overhead, but including QUIC framing overhead. 1257 | 1258 | time_sent: 1259 | : The time the packet was sent. 1260 | 1261 | 1262 | ## Constants of Interest {#constants-of-interest} 1263 | 1264 | Constants used in loss recovery are based on a combination of RFCs, papers, and 1265 | common practice. 1266 | 1267 | kPacketThreshold: 1268 | : Maximum reordering in packets before packet threshold loss detection 1269 | considers a packet lost. The value recommended in {{packet-threshold}} is 3. 1270 | 1271 | kTimeThreshold: 1272 | 1273 | : Maximum reordering in time before time threshold loss detection 1274 | considers a packet lost. Specified as an RTT multiplier. The value 1275 | recommended in {{time-threshold}} is 9/8. 
1276 | 1277 | kGranularity: 1278 | 1279 | : Timer granularity. This is a system-dependent value, and {{time-threshold}} 1280 | recommends a value of 1 ms. 1281 | 1282 | kInitialRtt: 1283 | : The RTT used before an RTT sample is taken. The value recommended in 1284 | {{pto-handshake}} is 333 ms. 1285 | 1286 | kPacketNumberSpace: 1287 | : An enum to enumerate the three packet number spaces: 1288 | 1289 | ~~~ 1290 | enum kPacketNumberSpace { 1291 | Initial, 1292 | Handshake, 1293 | ApplicationData, 1294 | } 1295 | ~~~ 1296 | 1297 | ## Variables of Interest {#ld-vars-of-interest} 1298 | 1299 | Variables required to implement the loss detection mechanisms 1300 | are described in this section. 1301 | 1302 | latest_rtt: 1303 | : The most recent RTT measurement made when receiving an acknowledgment for 1304 | a previously unacknowledged packet. 1305 | 1306 | smoothed_rtt: 1307 | : The smoothed RTT of the connection, computed as described in 1308 | {{smoothed-rtt}}. 1309 | 1310 | rttvar: 1311 | : The RTT variation, computed as described in {{smoothed-rtt}}. 1312 | 1313 | min_rtt: 1314 | : The minimum RTT seen over a period of time, ignoring acknowledgment delay, as 1315 | described in {{min-rtt}}. 1316 | 1317 | first_rtt_sample: 1318 | : The time that the first RTT sample was obtained. 1319 | 1320 | max_ack_delay: 1321 | : The maximum amount of time by which the receiver intends to delay 1322 | acknowledgments for packets in the Application Data packet number 1323 | space, as defined by the eponymous transport parameter ({{Section 18.2 1324 | of QUIC-TRANSPORT}}). Note that the actual ack_delay in a received 1325 | ACK frame may be larger due to late timers, reordering, or loss. 1326 | 1327 | loss_detection_timer: 1328 | : Multi-modal timer used for loss detection. 1329 | 1330 | pto_count: 1331 | : The number of times a PTO has been sent without receiving an acknowledgment.
1332 | 1333 | time_of_last_ack_eliciting_packet\[kPacketNumberSpace]: 1334 | : The time the most recent ack-eliciting packet was sent. 1335 | 1336 | largest_acked_packet\[kPacketNumberSpace]: 1337 | : The largest packet number acknowledged in the packet number space so far. 1338 | 1339 | loss_time\[kPacketNumberSpace]: 1340 | : The time at which the next packet in that packet number space can be 1341 | considered lost based on exceeding the reordering window in time. 1342 | 1343 | sent_packets\[kPacketNumberSpace]: 1344 | : An association of packet numbers in a packet number space to information 1345 | about them. Described in detail above in {{tracking-sent-packets}}. 1346 | 1347 | 1348 | ## Initialization 1349 | 1350 | At the beginning of the connection, initialize the loss detection variables as 1351 | follows: 1352 | 1353 | ~~~pseudocode 1354 | loss_detection_timer.reset() 1355 | pto_count = 0 1356 | latest_rtt = 0 1357 | smoothed_rtt = kInitialRtt 1358 | rttvar = kInitialRtt / 2 1359 | min_rtt = 0 1360 | first_rtt_sample = 0 1361 | for pn_space in [ Initial, Handshake, ApplicationData ]: 1362 | largest_acked_packet[pn_space] = infinite 1363 | time_of_last_ack_eliciting_packet[pn_space] = 0 1364 | loss_time[pn_space] = 0 1365 | ~~~ 1366 | 1367 | 1368 | ## On Sending a Packet 1369 | 1370 | After a packet is sent, information about the packet is stored. The parameters 1371 | to OnPacketSent are described in detail above in {{sent-packets-fields}}. 
1372 | 1373 | Pseudocode for OnPacketSent follows: 1374 | 1375 | ~~~pseudocode 1376 | OnPacketSent(packet_number, pn_space, ack_eliciting, 1377 | in_flight, sent_bytes): 1378 | sent_packets[pn_space][packet_number].packet_number = 1379 | packet_number 1380 | sent_packets[pn_space][packet_number].time_sent = now() 1381 | sent_packets[pn_space][packet_number].ack_eliciting = 1382 | ack_eliciting 1383 | sent_packets[pn_space][packet_number].in_flight = in_flight 1384 | sent_packets[pn_space][packet_number].sent_bytes = sent_bytes 1385 | if (in_flight): 1386 | if (ack_eliciting): 1387 | time_of_last_ack_eliciting_packet[pn_space] = now() 1388 | OnPacketSentCC(sent_bytes) 1389 | SetLossDetectionTimer() 1390 | ~~~ 1391 | 1392 | ## On Receiving a Datagram 1393 | 1394 | When a server is blocked by anti-amplification limits, receiving 1395 | a datagram unblocks it, even if none of the packets in the 1396 | datagram are successfully processed. In such a case, the PTO 1397 | timer will need to be rearmed. 1398 | 1399 | Pseudocode for OnDatagramReceived follows: 1400 | 1401 | ~~~pseudocode 1402 | OnDatagramReceived(datagram): 1403 | // If this datagram unblocks the server, arm the 1404 | // PTO timer to avoid deadlock. 1405 | if (server was at anti-amplification limit): 1406 | SetLossDetectionTimer() 1407 | if loss_detection_timer.timeout < now(): 1408 | // Execute PTO if it would have expired 1409 | // while the amplification limit applied. 1410 | OnLossDetectionTimeout() 1411 | ~~~ 1412 | 1413 | ## On Receiving an Acknowledgment 1414 | 1415 | When an ACK frame is received, it may newly acknowledge any number of packets. 
1416 | 1417 | Pseudocode for OnAckReceived and UpdateRtt follows: 1418 | 1419 | ~~~pseudocode 1420 | IncludesAckEliciting(packets): 1421 | for packet in packets: 1422 | if (packet.ack_eliciting): 1423 | return true 1424 | return false 1425 | 1426 | OnAckReceived(ack, pn_space): 1427 | if (largest_acked_packet[pn_space] == infinite): 1428 | largest_acked_packet[pn_space] = ack.largest_acked 1429 | else: 1430 | largest_acked_packet[pn_space] = 1431 | max(largest_acked_packet[pn_space], ack.largest_acked) 1432 | 1433 | // DetectAndRemoveAckedPackets finds packets that are newly 1434 | // acknowledged and removes them from sent_packets. 1435 | newly_acked_packets = 1436 | DetectAndRemoveAckedPackets(ack, pn_space) 1437 | // Nothing to do if there are no newly acked packets. 1438 | if (newly_acked_packets.empty()): 1439 | return 1440 | 1441 | // Update the RTT if the largest acknowledged is newly acked 1442 | // and at least one ack-eliciting was newly acked. 1443 | if (newly_acked_packets.largest().packet_number == 1444 | ack.largest_acked && 1445 | IncludesAckEliciting(newly_acked_packets)): 1446 | latest_rtt = 1447 | now() - newly_acked_packets.largest().time_sent 1448 | UpdateRtt(ack.ack_delay) 1449 | 1450 | // Process ECN information if present. 1451 | if (ACK frame contains ECN information): 1452 | ProcessECN(ack, pn_space) 1453 | 1454 | lost_packets = DetectAndRemoveLostPackets(pn_space) 1455 | if (!lost_packets.empty()): 1456 | OnPacketsLost(lost_packets) 1457 | OnPacketsAcked(newly_acked_packets) 1458 | 1459 | // Reset pto_count unless the client is unsure if 1460 | // the server has validated the client's address.
1461 | if (PeerCompletedAddressValidation()): 1462 | pto_count = 0 1463 | SetLossDetectionTimer() 1464 | 1465 | 1466 | UpdateRtt(ack_delay): 1467 | if (first_rtt_sample == 0): 1468 | min_rtt = latest_rtt 1469 | smoothed_rtt = latest_rtt 1470 | rttvar = latest_rtt / 2 1471 | first_rtt_sample = now() 1472 | return 1473 | 1474 | // min_rtt ignores acknowledgment delay. 1475 | min_rtt = min(min_rtt, latest_rtt) 1476 | // Limit ack_delay by max_ack_delay after handshake 1477 | // confirmation. 1478 | if (handshake confirmed): 1479 | ack_delay = min(ack_delay, max_ack_delay) 1480 | 1481 | // Adjust for acknowledgment delay if plausible. 1482 | adjusted_rtt = latest_rtt 1483 | if (latest_rtt >= min_rtt + ack_delay): 1484 | adjusted_rtt = latest_rtt - ack_delay 1485 | 1486 | rttvar = 3/4 * rttvar + 1/4 * abs(smoothed_rtt - adjusted_rtt) 1487 | smoothed_rtt = 7/8 * smoothed_rtt + 1/8 * adjusted_rtt 1488 | ~~~ 1489 | 1490 | ## Setting the Loss Detection Timer 1491 | 1492 | QUIC loss detection uses a single timer for all timeout loss detection. The 1493 | duration of the timer is based on the timer's mode, which is set in the packet 1494 | and timer events further below. The function SetLossDetectionTimer defined 1495 | below shows how the single timer is set. 1496 | 1497 | This algorithm may result in the timer being set in the past, particularly if 1498 | timers wake up late. Timers set in the past fire immediately. 
1499 | 1500 | Pseudocode for SetLossDetectionTimer follows (where the "^" operator represents 1501 | exponentiation): 1502 | 1503 | ~~~pseudocode 1504 | GetLossTimeAndSpace(): 1505 | time = loss_time[Initial] 1506 | space = Initial 1507 | for pn_space in [ Handshake, ApplicationData ]: 1508 | if (loss_time[pn_space] != 0 && (time == 0 || loss_time[pn_space] < time)): 1509 | time = loss_time[pn_space]; 1510 | space = pn_space 1511 | return time, space 1512 | 1513 | GetPtoTimeAndSpace(): 1514 | duration = (smoothed_rtt + max(4 * rttvar, kGranularity)) 1515 | * (2 ^ pto_count) 1516 | // Anti-deadlock PTO starts from the current time 1517 | if (no ack-eliciting packets in flight): 1518 | assert(!PeerCompletedAddressValidation()) 1519 | if (has handshake keys): 1520 | return (now() + duration), Handshake 1521 | else: 1522 | return (now() + duration), Initial 1523 | pto_timeout = infinite 1524 | pto_space = Initial 1525 | for space in [ Initial, Handshake, ApplicationData ]: 1526 | if (no ack-eliciting packets in flight in space): 1527 | continue; 1528 | if (space == ApplicationData): 1529 | // Skip Application Data until handshake confirmed. 1530 | if (handshake is not confirmed): 1531 | return pto_timeout, pto_space 1532 | // Include max_ack_delay and backoff for Application Data. 1533 | duration += max_ack_delay * (2 ^ pto_count) 1534 | 1535 | t = time_of_last_ack_eliciting_packet[space] + duration 1536 | if (t < pto_timeout): 1537 | pto_timeout = t 1538 | pto_space = space 1539 | return pto_timeout, pto_space 1540 | 1541 | PeerCompletedAddressValidation(): 1542 | // Assume clients validate the server's address implicitly. 1543 | if (endpoint is server): 1544 | return true 1545 | // Servers complete address validation when a 1546 | // protected packet is received.
1547 | return has received Handshake ACK || 1548 | handshake confirmed 1549 | 1550 | SetLossDetectionTimer(): 1551 | earliest_loss_time, _ = GetLossTimeAndSpace() 1552 | if (earliest_loss_time != 0): 1553 | // Time threshold loss detection. 1554 | loss_detection_timer.update(earliest_loss_time) 1555 | return 1556 | 1557 | if (server is at anti-amplification limit): 1558 | // The server's timer is not set if nothing can be sent. 1559 | loss_detection_timer.cancel() 1560 | return 1561 | 1562 | if (no ack-eliciting packets in flight && 1563 | PeerCompletedAddressValidation()): 1564 | // There is nothing to detect lost, so no timer is set. 1565 | // However, the client needs to arm the timer if the 1566 | // server might be blocked by the anti-amplification limit. 1567 | loss_detection_timer.cancel() 1568 | return 1569 | 1570 | timeout, _ = GetPtoTimeAndSpace() 1571 | loss_detection_timer.update(timeout) 1572 | ~~~ 1573 | 1574 | 1575 | ## On Timeout 1576 | 1577 | When the loss detection timer expires, the timer's mode determines the action 1578 | to be performed. 1579 | 1580 | Pseudocode for OnLossDetectionTimeout follows: 1581 | 1582 | ~~~pseudocode 1583 | OnLossDetectionTimeout(): 1584 | earliest_loss_time, pn_space = GetLossTimeAndSpace() 1585 | if (earliest_loss_time != 0): 1586 | // Time threshold loss Detection 1587 | lost_packets = DetectAndRemoveLostPackets(pn_space) 1588 | assert(!lost_packets.empty()) 1589 | OnPacketsLost(lost_packets) 1590 | SetLossDetectionTimer() 1591 | return 1592 | 1593 | if (no ack-eliciting packets in flight): 1594 | assert(!PeerCompletedAddressValidation()) 1595 | // Client sends an anti-deadlock packet: Initial is padded 1596 | // to earn more anti-amplification credit, 1597 | // a Handshake packet proves address ownership. 1598 | if (has Handshake keys): 1599 | SendOneAckElicitingHandshakePacket() 1600 | else: 1601 | SendOneAckElicitingPaddedInitialPacket() 1602 | else: 1603 | // PTO. 
Send new data if available, else retransmit old data. 1604 | // If neither is available, send a single PING frame. 1605 | _, pn_space = GetPtoTimeAndSpace() 1606 | SendOneOrTwoAckElicitingPackets(pn_space) 1607 | 1608 | pto_count++ 1609 | SetLossDetectionTimer() 1610 | ~~~ 1611 | 1612 | 1613 | ## Detecting Lost Packets 1614 | 1615 | DetectAndRemoveLostPackets is called every time an ACK is received or the time 1616 | threshold loss detection timer expires. This function operates on the 1617 | sent_packets for that packet number space and returns a list of packets newly 1618 | detected as lost. 1619 | 1620 | Pseudocode for DetectAndRemoveLostPackets follows: 1621 | 1622 | ~~~pseudocode 1623 | DetectAndRemoveLostPackets(pn_space): 1624 | assert(largest_acked_packet[pn_space] != infinite) 1625 | loss_time[pn_space] = 0 1626 | lost_packets = [] 1627 | loss_delay = kTimeThreshold * max(latest_rtt, smoothed_rtt) 1628 | 1629 | // Minimum time of kGranularity before packets are deemed lost. 1630 | loss_delay = max(loss_delay, kGranularity) 1631 | 1632 | // Packets sent before this time are deemed lost. 1633 | lost_send_time = now() - loss_delay 1634 | 1635 | foreach unacked in sent_packets[pn_space]: 1636 | if (unacked.packet_number > largest_acked_packet[pn_space]): 1637 | continue 1638 | 1639 | // Mark packet as lost, or set time when it should be marked. 1640 | // Note: The use of kPacketThreshold here assumes that there 1641 | // were no sender-induced gaps in the packet number space. 
1642 | if (unacked.time_sent <= lost_send_time || 1643 | largest_acked_packet[pn_space] >= 1644 | unacked.packet_number + kPacketThreshold): 1645 | sent_packets[pn_space].remove(unacked.packet_number) 1646 | lost_packets.insert(unacked) 1647 | else: 1648 | if (loss_time[pn_space] == 0): 1649 | loss_time[pn_space] = unacked.time_sent + loss_delay 1650 | else: 1651 | loss_time[pn_space] = min(loss_time[pn_space], 1652 | unacked.time_sent + loss_delay) 1653 | return lost_packets 1654 | ~~~ 1655 | 1656 | 1657 | ## Upon Dropping Initial or Handshake Keys 1658 | 1659 | When Initial or Handshake keys are discarded, packets from the space 1660 | are discarded and loss detection state is updated. 1661 | 1662 | Pseudocode for OnPacketNumberSpaceDiscarded follows: 1663 | 1664 | ~~~pseudocode 1665 | OnPacketNumberSpaceDiscarded(pn_space): 1666 | assert(pn_space != ApplicationData) 1667 | RemoveFromBytesInFlight(sent_packets[pn_space]) 1668 | sent_packets[pn_space].clear() 1669 | // Reset the loss detection and PTO timer 1670 | time_of_last_ack_eliciting_packet[pn_space] = 0 1671 | loss_time[pn_space] = 0 1672 | pto_count = 0 1673 | SetLossDetectionTimer() 1674 | ~~~ 1675 | 1676 | 1677 | # Congestion Control Pseudocode 1678 | 1679 | We now describe an example implementation of the congestion controller described 1680 | in {{congestion-control}}. 1681 | 1682 | The pseudocode segments in this section are licensed as Code Components; see the 1683 | copyright notice. 1684 | 1685 | ## Constants of Interest {#cc-consts-of-interest} 1686 | 1687 | Constants used in congestion control are based on a combination of RFCs, papers, 1688 | and common practice. 1689 | 1690 | kInitialWindow: 1691 | : Default limit on the initial bytes in flight as described in {{initial-cwnd}}. 1692 | 1693 | kMinimumWindow: 1694 | : Minimum congestion window in bytes as described in {{initial-cwnd}}. 
1695 | 1696 | kLossReductionFactor: 1697 | : Scaling factor applied to reduce the congestion window when a new loss event 1698 | is detected. {{congestion-control}} recommends a value of 0.5. 1699 | 1700 | kPersistentCongestionThreshold: 1701 | : Period of time for persistent congestion to be established, specified as a PTO 1702 | multiplier. {{persistent-congestion}} recommends a value of 3. 1703 | 1704 | 1705 | ## Variables of Interest {#vars-of-interest} 1706 | 1707 | Variables required to implement the congestion control mechanisms 1708 | are described in this section. 1709 | 1710 | max_datagram_size: 1711 | : The sender's current maximum payload size. This does not include UDP or IP 1712 | overhead. The max datagram size is used for congestion window 1713 | computations. An endpoint sets the value of this variable based on its Path 1714 | Maximum Transmission Unit (PMTU; see {{Section 14.2 of QUIC-TRANSPORT}}), with 1715 | a minimum value of 1200 bytes. 1716 | 1717 | ecn_ce_counters\[kPacketNumberSpace]: 1718 | : The highest value reported for the ECN-CE counter in the packet number space 1719 | by the peer in an ACK frame. This value is used to detect increases in the 1720 | reported ECN-CE counter. 1721 | 1722 | bytes_in_flight: 1723 | : The sum of the size in bytes of all sent packets that contain at least one 1724 | ack-eliciting or PADDING frame and have not been acknowledged or declared 1725 | lost. The size does not include IP or UDP overhead, but does include the QUIC 1726 | header and Authenticated Encryption with Associated Data (AEAD) overhead. 1727 | Packets only containing ACK frames do not count toward bytes_in_flight to 1728 | ensure congestion control does not impede congestion feedback. 1729 | 1730 | congestion_window: 1731 | : Maximum number of bytes allowed to be in flight. 1732 | 1733 | congestion_recovery_start_time: 1734 | : The time the current recovery period started due to the detection of loss 1735 | or ECN. 
When a packet sent after this time is acknowledged, QUIC exits 1736 | congestion recovery. 1737 | 1738 | ssthresh: 1739 | : Slow start threshold in bytes. When the congestion window is below ssthresh, 1740 | the mode is slow start and the window grows by the number of bytes 1741 | acknowledged. 1742 | 1743 | The congestion control pseudocode also accesses some of the variables from the 1744 | loss recovery pseudocode. 1745 | 1746 | ## Initialization 1747 | 1748 | At the beginning of the connection, initialize the congestion control 1749 | variables as follows: 1750 | 1751 | ~~~pseudocode 1752 | congestion_window = kInitialWindow 1753 | bytes_in_flight = 0 1754 | congestion_recovery_start_time = 0 1755 | ssthresh = infinite 1756 | for pn_space in [ Initial, Handshake, ApplicationData ]: 1757 | ecn_ce_counters[pn_space] = 0 1758 | ~~~ 1759 | 1760 | 1761 | ## On Packet Sent 1762 | 1763 | Whenever a packet is sent and it contains non-ACK frames, the packet 1764 | increases bytes_in_flight. 1765 | 1766 | ~~~pseudocode 1767 | OnPacketSentCC(sent_bytes): 1768 | bytes_in_flight += sent_bytes 1769 | ~~~ 1770 | 1771 | 1772 | ## On Packet Acknowledgment 1773 | 1774 | This is invoked from loss detection's OnAckReceived and is supplied with the 1775 | newly acked_packets from sent_packets. 1776 | 1777 | In congestion avoidance, implementers that use an integer representation 1778 | for congestion_window should be careful with division and can use 1779 | the alternative approach suggested in {{Section 2.1 of RFC3465}}. 1780 | 1781 | ~~~pseudocode 1782 | InCongestionRecovery(sent_time): 1783 | return sent_time <= congestion_recovery_start_time 1784 | 1785 | OnPacketsAcked(acked_packets): 1786 | for acked_packet in acked_packets: 1787 | OnPacketAcked(acked_packet) 1788 | 1789 | OnPacketAcked(acked_packet): 1790 | if (!acked_packet.in_flight): 1791 | return 1792 | // Remove from bytes_in_flight.
1793 | bytes_in_flight -= acked_packet.sent_bytes 1794 | // Do not increase congestion_window if application 1795 | // limited or flow control limited. 1796 | if (IsAppOrFlowControlLimited()): 1797 | return 1798 | // Do not increase congestion window in recovery period. 1799 | if (InCongestionRecovery(acked_packet.time_sent)): 1800 | return 1801 | if (congestion_window < ssthresh): 1802 | // Slow start. 1803 | congestion_window += acked_packet.sent_bytes 1804 | else: 1805 | // Congestion avoidance. 1806 | congestion_window += 1807 | max_datagram_size * acked_packet.sent_bytes 1808 | / congestion_window 1809 | ~~~ 1810 | 1811 | 1812 | ## On New Congestion Event 1813 | 1814 | This is invoked from ProcessECN and OnPacketsLost when a new congestion event is 1815 | detected. If not already in recovery, this starts a recovery period and 1816 | reduces the slow start threshold and congestion window immediately. 1817 | 1818 | ~~~pseudocode 1819 | OnCongestionEvent(sent_time): 1820 | // No reaction if already in a recovery period. 1821 | if (InCongestionRecovery(sent_time)): 1822 | return 1823 | 1824 | // Enter recovery period. 1825 | congestion_recovery_start_time = now() 1826 | ssthresh = congestion_window * kLossReductionFactor 1827 | congestion_window = max(ssthresh, kMinimumWindow) 1828 | // A packet can be sent to speed up loss recovery. 1829 | MaybeSendOnePacket() 1830 | ~~~ 1831 | 1832 | 1833 | ## Process ECN Information 1834 | 1835 | This is invoked when an ACK frame with an ECN section is received from the peer. 1836 | 1837 | ~~~pseudocode 1838 | ProcessECN(ack, pn_space): 1839 | // If the ECN-CE counter reported by the peer has increased, 1840 | // this could be a new congestion event.
1841 | if (ack.ce_counter > ecn_ce_counters[pn_space]): 1842 | ecn_ce_counters[pn_space] = ack.ce_counter 1843 | sent_time = sent_packets[pn_space][ack.largest_acked].time_sent 1844 | OnCongestionEvent(sent_time) 1845 | ~~~ 1846 | 1847 | 1848 | ## On Packets Lost 1849 | 1850 | This is invoked when DetectAndRemoveLostPackets deems packets lost. 1851 | 1852 | ~~~pseudocode 1853 | OnPacketsLost(lost_packets): 1854 | sent_time_of_last_loss = 0 1855 | // Remove lost packets from bytes_in_flight. 1856 | for lost_packet in lost_packets: 1857 | if lost_packet.in_flight: 1858 | bytes_in_flight -= lost_packet.sent_bytes 1859 | sent_time_of_last_loss = 1860 | max(sent_time_of_last_loss, lost_packet.time_sent) 1861 | // Congestion event if in-flight packets were lost 1862 | if (sent_time_of_last_loss != 0): 1863 | OnCongestionEvent(sent_time_of_last_loss) 1864 | 1865 | // Reset the congestion window if the loss of these 1866 | // packets indicates persistent congestion. 1867 | // Only consider packets sent after getting an RTT sample. 1868 | if (first_rtt_sample == 0): 1869 | return 1870 | pc_lost = [] 1871 | for lost in lost_packets: 1872 | if lost.time_sent > first_rtt_sample: 1873 | pc_lost.insert(lost) 1874 | if (InPersistentCongestion(pc_lost)): 1875 | congestion_window = kMinimumWindow 1876 | congestion_recovery_start_time = 0 1877 | ~~~ 1878 | 1879 | 1880 | ## Removing Discarded Packets from Bytes in Flight 1881 | 1882 | When Initial or Handshake keys are discarded, packets sent in that space no 1883 | longer count toward bytes in flight. 1884 | 1885 | Pseudocode for RemoveFromBytesInFlight follows: 1886 | 1887 | ~~~pseudocode 1888 | RemoveFromBytesInFlight(discarded_packets): 1889 | // Remove any unacknowledged packets from flight.
1890 | foreach packet in discarded_packets: 1891 | if packet.in_flight: 1892 | bytes_in_flight -= packet.sent_bytes 1893 | ~~~ 1894 | 1895 | 1896 | # Contributors 1897 | {: numbered="false"} 1898 | 1899 | The IETF QUIC Working Group received an enormous amount of support from many 1900 | people. The following people provided substantive contributions to this 1901 | document: 1902 | 1903 |