├── .editorconfig ├── .gitattributes ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ └── quic-draft-issue.md ├── in-solidarity.yml ├── release-drafter.yml └── workflows │ ├── archive.yml │ ├── assign-to-project.yml │ ├── ghpages.yml │ ├── publish.yml │ └── update.yml ├── .gitignore ├── .lint.py ├── .travis.yml ├── CONTRIBUTING.md ├── Makefile ├── README.md ├── ietf.json ├── protection-samples.js ├── rfc8999.md ├── rfc9000.md ├── rfc9001.md ├── rfc9002.md ├── rfc9114.md ├── rfc9204.md ├── tag.sh ├── writeups └── base-drafts.md └── xml2rfc-tidy.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # See http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*.md] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_size = 2 9 | indent_style = space 10 | insert_final_newline = true 11 | max_line_length = 80 12 | trim_trailing_whitespace = true 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.md text 7 | *.xml text 8 | 9 | # Declare files that will always have LF line endings on checkout. 
10 | *.sh text eol=lf 11 | *.mk txt eol=lf -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @quicwg/chairs 2 | 3 | draft-ietf-quic-http.md @MikeBishop 4 | 5 | draft-ietf-quic-invariants.md @martinthomson 6 | 7 | draft-ietf-quic-qpack.md @MikeBishop @afrind 8 | 9 | draft-ietf-quic-recovery.md @janaiyengar @ianswett 10 | 11 | draft-ietf-quic-tls.md @martinthomson @seanturner 12 | 13 | draft-ietf-quic-transport.md @janaiyengar @martinthomson 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/quic-draft-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUIC Draft Issue 3 | about: File an issue with a QUIC draft document 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Before opening an issue, please familiarise yourself with the QUIC WG [Contribution Guidelines](https://github.com/quicwg/base-drafts/blob/master/CONTRIBUTING.md) and [Late-Stage Process](https://github.com/quicwg/base-drafts/blob/master/CONTRIBUTING.md#late-stage-process). 11 | 12 | All documents in this repository follow this process. Before filing a new issue against any of them, please consider a few things: 13 | 14 | * Issues should be just that; issues with our deliverables, **not proposals, questions or support requests**. 15 | * Please review the issues list to make sure that you aren't filing a duplicate. Design issues that revisit a topic where there's already declared consensus (see https://github.com/quicwg/base-drafts/issues?q=is%3Aclosed+label%3Ahas-consensus) need to provide compelling reasons to warrant reopening the discussion. 16 | * If you're not sure how to phrase your issue, please ask on the [mailing list](https://www.ietf.org/mailman/listinfo/quic). 
17 | -------------------------------------------------------------------------------- /.github/in-solidarity.yml: -------------------------------------------------------------------------------- 1 | _extends: ietf/terminology 2 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: Transport 3 | label: -transport 4 | - title: Recovery 5 | label: -recovery 6 | - title: TLS 7 | label: -tls 8 | - title: HTTP/3 9 | label: -http 10 | change-template: - $TITLE (#$NUMBER) 11 | template: | 12 | ## What’s Changed 13 | 14 | $CHANGES -------------------------------------------------------------------------------- /.github/workflows/archive.yml: -------------------------------------------------------------------------------- 1 | name: "Archive Issues and Pull Requests" 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * 0,2,4' 6 | repository_dispatch: 7 | types: [archive] 8 | workflow_dispatch: 9 | inputs: 10 | archive_full: 11 | description: 'Recreate the archive from scratch' 12 | default: false 13 | type: boolean 14 | 15 | jobs: 16 | build: 17 | name: "Archive Issues and Pull Requests" 18 | runs-on: ubuntu-latest 19 | steps: 20 | - name: "Checkout" 21 | uses: actions/checkout@v4 22 | 23 | # Note: No caching for this build! 
24 | 25 | - name: "Update Archive" 26 | uses: martinthomson/i-d-template@v1 27 | env: 28 | ARCHIVE_FULL: ${{ inputs.archive_full }} 29 | with: 30 | make: archive 31 | token: ${{ github.token }} 32 | 33 | - name: "Update GitHub Pages" 34 | uses: martinthomson/i-d-template@v1 35 | with: 36 | make: gh-archive 37 | token: ${{ github.token }} 38 | 39 | - name: "Save Archive" 40 | uses: actions/upload-artifact@v4 41 | with: 42 | path: archive.json 43 | -------------------------------------------------------------------------------- /.github/workflows/assign-to-project.yml: -------------------------------------------------------------------------------- 1 | name: Auto Assign to Late Stage Processing Project 2 | 3 | on: 4 | issues: 5 | types: [opened, labeled] 6 | env: 7 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 8 | 9 | jobs: 10 | assign_one_project: 11 | runs-on: ubuntu-latest 12 | name: Assign to One Project 13 | steps: 14 | - name: Assign NEW issues to Late Stage Processing 15 | uses: srggrs/assign-one-project-github-action@1.2.0 16 | if: github.event.action == 'opened' 17 | with: 18 | project: 'https://github.com/quicwg/base-drafts/projects/5' 19 | -------------------------------------------------------------------------------- /.github/workflows/ghpages.yml: -------------------------------------------------------------------------------- 1 | name: "Update Editor's Copy" 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - README.md 7 | - CONTRIBUTING.md 8 | - LICENSE.md 9 | - .gitignore 10 | pull_request: 11 | paths-ignore: 12 | - README.md 13 | - CONTRIBUTING.md 14 | - LICENSE.md 15 | - .gitignore 16 | 17 | jobs: 18 | build: 19 | name: "Update Editor's Copy" 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: "Checkout" 23 | uses: actions/checkout@v4 24 | 25 | - name: "Setup" 26 | id: setup 27 | run: date -u "+date=%FT%T" >>"$GITHUB_OUTPUT" 28 | 29 | - name: "Caching" 30 | uses: actions/cache@v4 31 | with: 32 | path: | 33 | .refcache 34 | .venv 35 | .gems 36 | 
node_modules 37 | .targets.mk 38 | key: i-d-${{ steps.setup.outputs.date }} 39 | restore-keys: i-d- 40 | 41 | - name: "Build Drafts" 42 | uses: martinthomson/i-d-template@v1 43 | with: 44 | token: ${{ github.token }} 45 | 46 | - name: "Update GitHub Pages" 47 | uses: martinthomson/i-d-template@v1 48 | if: ${{ github.event_name == 'push' }} 49 | with: 50 | make: gh-pages 51 | token: ${{ github.token }} 52 | 53 | - name: "Archive Built Drafts" 54 | uses: actions/upload-artifact@v4 55 | with: 56 | path: | 57 | draft-*.html 58 | draft-*.txt 59 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: "Publish New Draft Version" 2 | 3 | on: 4 | push: 5 | tags: 6 | - "draft-*" 7 | workflow_dispatch: 8 | inputs: 9 | email: 10 | description: "Submitter email" 11 | default: "" 12 | type: string 13 | 14 | jobs: 15 | build: 16 | name: "Publish New Draft Version" 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: "Checkout" 20 | uses: actions/checkout@v4 21 | 22 | # See https://github.com/actions/checkout/issues/290 23 | - name: "Get Tag Annotations" 24 | run: git fetch -f origin ${{ github.ref }}:${{ github.ref }} 25 | 26 | - name: "Setup" 27 | id: setup 28 | run: date -u "+date=%FT%T" >>"$GITHUB_OUTPUT" 29 | 30 | - name: "Caching" 31 | uses: actions/cache@v4 32 | with: 33 | path: | 34 | .refcache 35 | .venv 36 | .gems 37 | node_modules 38 | .targets.mk 39 | key: i-d-${{ steps.setup.outputs.date }} 40 | restore-keys: i-d- 41 | 42 | - name: "Build Drafts" 43 | uses: martinthomson/i-d-template@v1 44 | with: 45 | token: ${{ github.token }} 46 | 47 | - name: "Upload to Datatracker" 48 | uses: martinthomson/i-d-template@v1 49 | with: 50 | make: upload 51 | env: 52 | UPLOAD_EMAIL: ${{ inputs.email }} 53 | 54 | - name: "Archive Submitted Drafts" 55 | uses: actions/upload-artifact@v4 56 | with: 57 | path: 
"versioned/draft-*-[0-9][0-9].*" 58 | -------------------------------------------------------------------------------- /.github/workflows/update.yml: -------------------------------------------------------------------------------- 1 | name: "Update Generated Files" 2 | # This rule is not run automatically. 3 | # It can be run manually to update all of the files that are part 4 | # of the template, specifically: 5 | # - README.md 6 | # - CONTRIBUTING.md 7 | # - .note.xml 8 | # - .github/CODEOWNERS 9 | # - Makefile 10 | # 11 | # 12 | # This might be useful if you have: 13 | # - added, removed, or renamed drafts (including after adoption) 14 | # - added, removed, or changed draft editors 15 | # - changed the title of drafts 16 | # 17 | # Note that this removes any customizations you have made to 18 | # the affected files. 19 | on: workflow_dispatch 20 | 21 | jobs: 22 | build: 23 | name: "Update Files" 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: "Checkout" 27 | uses: actions/checkout@v4 28 | 29 | - name: "Update Generated Files" 30 | uses: martinthomson/i-d-template@v1 31 | with: 32 | make: update-files 33 | token: ${{ github.token }} 34 | 35 | - name: "Push Update" 36 | run: git push 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.pdf 3 | *.redxml 4 | *.swp 5 | *.txt 6 | *.upload 7 | *~ 8 | .refcache 9 | .tags 10 | .targets.mk 11 | /*-[0-9][0-9].xml 12 | /lib 13 | /node_modules/ 14 | /old/ 15 | Gemfile.lock 16 | archive.json 17 | package-lock.json 18 | report.xml 19 | rfc8999.xml 20 | rfc9000.xml 21 | rfc9001.xml 22 | rfc9002.xml 23 | rfc9114.xml 24 | rfc9204.xml 25 | -------------------------------------------------------------------------------- /.lint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import argparse 5 | import 
re 6 | 7 | parser = argparse.ArgumentParser(description="Lint markdown drafts.") 8 | parser.add_argument("files", metavar="file", nargs="+", help="Files to lint") 9 | parser.add_argument("-l", dest="maxLineLength", default=80) 10 | parser.add_argument("-f", dest="maxFigureLineLength", default=66) 11 | 12 | args = parser.parse_args() 13 | 14 | foundError = False 15 | 16 | for inputfile in args.files: 17 | insideFigure = False 18 | beforeAbstract = True 19 | 20 | with open(inputfile, mode="rt", newline=None, encoding="utf-8") as draft: 21 | linenumber = 0 22 | lines = draft.readlines() 23 | 24 | abstract = re.compile("^--- abstract") 25 | table = re.compile("^\s*(?:\||{:)") 26 | figure = re.compile("^[~`]{3,}") 27 | 28 | for line in lines: 29 | line = line.rstrip("\r\n") 30 | linenumber += 1 31 | 32 | def err(msg): 33 | global foundError 34 | foundError = True 35 | sys.stderr.write("{0}:{1}: {2}\n".format(inputfile, linenumber, msg)) 36 | sys.stderr.write("{0}\n".format(line)) 37 | 38 | if line.find("\t") >= 0: 39 | err("Line contains HTAB") 40 | 41 | # Skip everything before abstract 42 | if beforeAbstract: 43 | matchObj = abstract.match(line) 44 | if matchObj: 45 | beforeAbstract = False 46 | continue 47 | 48 | # Skip tables 49 | matchObj = table.match(line) 50 | if matchObj: 51 | continue 52 | 53 | # Toggle figure state 54 | matchObj = figure.match(line) 55 | if matchObj: 56 | insideFigure = not insideFigure 57 | continue 58 | 59 | # Check length 60 | length = len(line) 61 | limit = ( 62 | int(args.maxFigureLineLength) 63 | if insideFigure 64 | else int(args.maxLineLength) 65 | ) 66 | if length > limit: 67 | err("Line is {0} characters; limit is {1}".format(length, limit)) 68 | 69 | sys.exit(1 if foundError else 0) 70 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | dist: trusty 4 | 5 | addons: 6 
| apt: 7 | packages: 8 | - python-pip 9 | - xsltproc 10 | 11 | env: 12 | global: 13 | - GOPATH="${TRAVIS_BUILD_DIR}/.go_workspace" 14 | - mmark_src=github.com/miekg/mmark/mmark 15 | - mmark=./mmark 16 | 17 | install: 18 | - pip install xml2rfc 19 | - if head -1 -q *.md | grep '^\-\-\-' >/dev/null 2>&1; then gem install --no-doc kramdown-rfc2629; fi 20 | - if head -1 -q *.md | grep '^%%%' >/dev/null 2>&1; then go get "$mmark_src" && go build "$mmark_src"; fi 21 | 22 | script: make ghpages 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # QUIC version 1 is done 2 | 3 | The base-drafts repository is the historical home of the QUIC version 1 4 | specifications that were written by the IETF QUIC Working Group. 5 | 6 | The set of documents are described [here](https://github.com/quicwg/base-drafts/blob/main/README.md). 7 | 8 | **Be aware that all contributions fall under the "[NOTE WELL](#note-well)" terms 9 | outlined below and our [Code of Conduct](#code-of-conduct) applies.** 10 | 11 | # Engaging with the QUIC community 12 | 13 | The scope of work in the QUIC Working Group is described in our 14 | [charter](https://datatracker.ietf.org/wg/quic/about/) and it extends beyond the 15 | development of the documents held in this repository. Anyone is welcome to 16 | contribute to the QUIC Working Group; you don't have to join the Working Group, 17 | because there is no "membership" -- anyone who participates in the work **is** a 18 | part of the QUIC Working Group. 19 | 20 | Before doing so, please familiarize yourself with our 21 | [charter](https://datatracker.ietf.org/wg/quic/about/). If you're new to IETF 22 | work, you may also want to read the [Tao of the 23 | IETF](https://www.ietf.org/tao.html). 
24 | 25 | ## Following Discussion 26 | 27 | The Working Group has a few venues for discussion: 28 | 29 | * We plan to meet at all [IETF meetings](https://www.ietf.org/meeting/) for the 30 | foreseeable future, and possibly hold interim meetings between them as 31 | required. Agendas, minutes, and presentations are available in our [meeting 32 | materials repository](https://github.com/quicwg/wg-materials) and the 33 | [official proceedings](https://datatracker.ietf.org/wg/quic/meetings/). 34 | 35 | * Our [mailing list](https://www.ietf.org/mailman/listinfo/quic) is used for 36 | most communication, including notifications of meetings, new drafts, consensus 37 | calls and other business, as well as issue discussion. 38 | 39 | * We maintain several repositories in our GitHub organization 40 | [GitHub](https://github.com/quicwg/). Specific issues are discussed on the 41 | relevant issues list. If you don't want to use GitHub to follow these 42 | discussions, you can subscribe to the [issue announce 43 | list](https://www.ietf.org/mailman/listinfo/quic-issues). 44 | 45 | * The [quicdev Slack](https://quicdev.slack.com/) is used for more realtime 46 | communication, typically amongst implementers, operators and researchers. 47 | Contact the [WG chairs](mailto:quic-chairs@ietf.org) for an invitation. Note that 48 | discussions on Slack are subject to the contribution guideline described in 49 | this document. 50 | 51 | To be active in the Working Group, you can participate in any of these places. 52 | Most activity takes place on the mailing list, but if you just want to comment 53 | on and raise issues, that's fine too. 54 | 55 | ## Code of Conduct 56 | 57 | The [IETF Guidelines for Conduct](https://tools.ietf.org/html/rfc7154) applies to all Working Group 58 | communications and meetings. 
59 | 60 | 61 | ## NOTE WELL 62 | 63 | Any submission to the [IETF](https://www.ietf.org/) intended by the Contributor for publication as 64 | all or part of an IETF Internet-Draft or RFC and any statement made within the context of an IETF 65 | activity is considered an "IETF Contribution". Such statements include oral statements in IETF 66 | sessions, as well as written and electronic communications made at any time or place, which are 67 | addressed to: 68 | 69 | * The IETF plenary session 70 | * The IESG, or any member thereof on behalf of the IESG 71 | * Any IETF mailing list, including the IETF list itself, any working group 72 | or design team list, or any other list functioning under IETF auspices 73 | * Any IETF working group or portion thereof 74 | * Any Birds of a Feather (BOF) session 75 | * The IAB or any member thereof on behalf of the IAB 76 | * The RFC Editor or the Internet-Drafts function 77 | * All IETF Contributions are subject to the rules of 78 | [RFC 5378](https://tools.ietf.org/html/rfc5378) and 79 | [RFC 8179](https://tools.ietf.org/html/rfc8179). 80 | 81 | Statements made outside of an IETF session, mailing list or other function, that are clearly not 82 | intended to be input to an IETF activity, group or function, are not IETF Contributions in the 83 | context of this notice. 84 | 85 | Please consult [RFC 5378](https://tools.ietf.org/html/rfc5378) and [RFC 8179](https://tools.ietf.org/html/rfc8179) for details. 86 | 87 | A participant in any IETF activity is deemed to accept all IETF rules of process, as documented in 88 | Best Current Practices RFCs and IESG Statements. 89 | 90 | A participant in any IETF activity acknowledges that written, audio and video records of meetings 91 | may be made and may be available to the public. 
92 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MD_PREPROCESSOR := sed -e 's/{DATE}/$(shell date '+%Y-%m-%d')/g' 2 | TIDY := true 3 | 4 | LIBDIR := lib 5 | include $(LIBDIR)/main.mk 6 | 7 | $(LIBDIR)/main.mk: 8 | ifneq (,$(shell git submodule status $(LIBDIR) 2>/dev/null)) 9 | git submodule sync 10 | git submodule update $(CLONE_ARGS) --init 11 | else 12 | git clone -q --depth 10 $(CLONE_ARGS) \ 13 | -b main https://github.com/martinthomson/i-d-template $(LIBDIR) 14 | endif 15 | 16 | latest:: lint 17 | .PHONY: lint 18 | 19 | lint:: 20 | @$(trace) wslint $(python) ./.lint.py $(addsuffix .md,$(drafts)) 21 | 22 | show-next: 23 | @echo $(drafts_next) 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QUIC Protocol Drafts 2 | 3 | The base-drafts repository is the historical home of the QUIC version 1 4 | specifications that were written by the QUIC Working Group. 5 | 6 | **The documents have now been published as RFCs. Technical or editorial 7 | errata can be reported to the RFC Editor using the [errata 8 | tool](https://www.rfc-editor.org/errata.php).** 9 | 10 | **The QUIC Working Group welcomes discussion about new versions of QUIC, and new 11 | extensions to QUIC, or other proposals related to the QUIC transport. 
See 12 | [Engaging with the QUIC 13 | community](https://github.com/quicwg/base-drafts/blob/main/CONTRIBUTING.md#engaging-with-the-quic-community) 14 | for guidance.** 15 | 16 | ## QUIC Invariants 17 | 18 | * [RFC 8999](https://quicwg.org/base-drafts/rfc8999.html) 19 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-invariants) 20 | 21 | ## Core Transport Protocol 22 | 23 | * [RFC 9000](https://quicwg.org/base-drafts/rfc9000.html) 24 | * [Working Group Draft](https://datatracker.ietf.org/doc/html/draft-ietf-quic-transport) 25 | 26 | ## Loss Detection & Congestion Control 27 | 28 | * [RFC 9002](https://quicwg.org/base-drafts/rfc9002.html) 29 | * [Working Group Draft](https://datatracker.ietf.org/doc/html/draft-ietf-quic-recovery) 30 | 31 | ## TLS Mapping 32 | 33 | * [RFC 9001](https://quicwg.org/base-drafts/rfc9001.html) 34 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-tls) 35 | 36 | ## HTTP Mapping 37 | 38 | * [RFC 9114](https://quicwg.org/base-drafts/rfc9114.html) 39 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-http) 40 | 41 | ## QPACK 42 | 43 | * [RFC 9204](https://quicwg.org/base-drafts/rfc9204.html) 44 | * [Datatracker](https://datatracker.ietf.org/doc/html/draft-ietf-quic-qpack) 45 | -------------------------------------------------------------------------------- /ietf.json: -------------------------------------------------------------------------------- 1 | { 2 | "group": "quic", 3 | "group_info": { 4 | "name": "QUIC", 5 | "type": "wg", 6 | "email": "quic@ietf.org", 7 | "activity_exclude_labels": ["editorial"] 8 | }, 9 | "repo_type": "specs", 10 | "revisions_tagged": true, 11 | "activity_summary_to": ["group_email"] 12 | } -------------------------------------------------------------------------------- /protection-samples.js: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ':' //; exec "$(command -v nodejs || command -v node)" "$0" 
"$@" 3 | 4 | // This script performs simple encryption and decryption for Initial packets. 5 | // It's crude, but it should be sufficient to generate examples. 6 | 7 | 8 | 'use strict'; 9 | require('buffer'); 10 | const assert = require('assert'); 11 | const crypto = require('crypto'); 12 | 13 | const INITIAL_SALT = Buffer.from('38762cf7f55934b34d179ae6a4c80cadccbb7f0a', 'hex'); 14 | const RETRY_KEY = Buffer.from('be0c690b9f66575a1d766b54e368c84e', 'hex'); 15 | const RETRY_NONCE = Buffer.from('461599d35d632bf2239825bb', 'hex'); 16 | const SHA256 = 'sha256'; 17 | const AES_GCM = 'aes-128-gcm'; 18 | const AES_ECB = 'aes-128-ecb'; 19 | 20 | const version = '00000001'; 21 | 22 | function chunk(s, n) { 23 | return (new Array(Math.ceil(s.length / n))) 24 | .fill() 25 | .map((_, i) => s.slice(i * n, i * n + n)); 26 | } 27 | 28 | function log(m, k) { 29 | console.log(m + ' [' + k.length + ']: ' + chunk(k.toString('hex'), 32).join(' ')); 30 | }; 31 | 32 | class HMAC { 33 | constructor(hash) { 34 | this.hash = hash; 35 | } 36 | 37 | digest(key, input) { 38 | var hmac = crypto.createHmac(this.hash, key); 39 | hmac.update(input); 40 | return hmac.digest(); 41 | } 42 | } 43 | 44 | /* HKDF as defined in RFC5869, with HKDF-Expand-Label from RFC8446. 
*/ 45 | class QHKDF { 46 | constructor(hmac, prk) { 47 | this.hmac = hmac; 48 | this.prk = prk; 49 | } 50 | 51 | static extract(hash, salt, ikm) { 52 | var hmac = new HMAC(hash); 53 | return new QHKDF(hmac, hmac.digest(salt, ikm)); 54 | } 55 | 56 | expand(info, len) { 57 | var output = Buffer.alloc(0); 58 | var T = Buffer.alloc(0); 59 | info = Buffer.from(info, 'ascii'); 60 | var counter = 0; 61 | var cbuf = Buffer.alloc(1); 62 | while (output.length < len) { 63 | cbuf.writeUIntBE(++counter, 0, 1); 64 | T = this.hmac.digest(this.prk, Buffer.concat([T, info, cbuf])); 65 | output = Buffer.concat([output, T]); 66 | } 67 | 68 | return output.slice(0, len); 69 | } 70 | 71 | expand_label(label, len) { 72 | const prefix = "tls13 "; 73 | var info = Buffer.alloc(2 + 1 + prefix.length + label.length + 1); 74 | // Note that Buffer.write returns the number of bytes written, whereas 75 | // Buffer.writeUIntBE returns the end offset of the write. Consistency FTW. 76 | var offset = info.writeUIntBE(len, 0, 2); 77 | offset = info.writeUIntBE(prefix.length + label.length, offset, 1); 78 | offset += info.write(prefix + label, offset); 79 | info.writeUIntBE(0, offset, 1); 80 | log('info for ' + label, info); 81 | return this.expand(info, len); 82 | } 83 | } 84 | 85 | // XOR b into a. 
function xor(a, b) {
  // Mutates `a` in place.  Indexes past the end of `b` read as undefined, which
  // XORs as 0 and therefore leaves those bytes of `a` unchanged.
  a.forEach((_, i) => {
    a[i] ^= b[i];
  });
}

// Returns a copy of `iv` with `counter` XORed into its last 6 bytes; `iv`
// itself is not modified.  The XOR is done in two 24-bit halves because
// JavaScript bitwise operators truncate their operands to 32 bits.
function applyNonce(iv, counter) {
  var nonce = Buffer.from(iv);
  const m = nonce.readUIntBE(nonce.length - 6, 6);
  const x = ((m ^ counter) & 0xffffff) +
    ((((m / 0x1000000) ^ (counter / 0x1000000)) & 0xffffff) * 0x1000000);
  nonce.writeUIntBE(x, nonce.length - 6, 6);
  return nonce;
}

// Packet protection for Initial packets: derives key, iv and header-protection
// key from a connection ID and a label, then encrypts/decrypts packets with
// them.
class InitialProtection {
  // label: HKDF label for the per-endpoint secret (callers in this file pass
  //        role + ' in').
  // cid:   Buffer holding the connection ID used as input keying material.
  constructor(label, cid) {
    var qhkdf = QHKDF.extract(SHA256, INITIAL_SALT, cid);
    log('initial_secret', qhkdf.prk);
    qhkdf = new QHKDF(qhkdf.hmac, qhkdf.expand_label(label, 32));
    log(label + ' secret', qhkdf.prk);
    this.key = qhkdf.expand_label("quic key", 16);
    log(label + ' key', this.key);
    this.iv = qhkdf.expand_label("quic iv", 12);
    log(label + ' iv', this.iv);
    this.hp = qhkdf.expand_label("quic hp", 16);
    log(label + ' hp', this.hp);
  }

  // Returns the AEAD nonce for packet number `counter`.
  generateNonce(counter) {
    return applyNonce(this.iv, counter);
  }

  // Returns the encrypted data with authentication tag appended.  The AAD is
  // used, but not added to the output.
  encipher(pn, aad, data) {
    console.log('encipher pn', pn);
    log('encipher aad', aad);
    log('encipher data', data);
    var nonce = this.generateNonce(pn);
    var gcm = crypto.createCipheriv(AES_GCM, this.key, nonce);
    gcm.setAAD(aad);
    var e = gcm.update(data);
    gcm.final();
    e = Buffer.concat([e, gcm.getAuthTag()]);
    log('enciphered', e);
    return e;
  }

  // Inverse of encipher(): `data` is ciphertext followed by a 16-byte tag.
  // Returns the plaintext; gcm.final() throws if authentication fails.
  decipher(pn, aad, data) {
    console.log('decipher pn', pn);
    log('decipher aad', aad);
    log('decipher data', data);
    var nonce = this.generateNonce(pn);
    var gcm = crypto.createDecipheriv(AES_GCM, this.key, nonce);
    gcm.setAAD(aad);
    gcm.setAuthTag(data.slice(data.length - 16));
    var d = gcm.update(data.slice(0, data.length - 16));
    gcm.final();
    log('deciphered', d);
    return d;
  }

  // Calculates the header protection mask.  Returns 16 bytes of output.
  // The mask is a single AES-ECB encryption of the 16-byte sample under the
  // header protection key.
  hpMask(sample) {
    log('hp sample', sample);
    var ecb = crypto.createCipheriv(AES_ECB, this.hp, Buffer.alloc(0));
    var mask = ecb.update(sample);
    log('hp mask', mask);
    return mask;
  }

  // hdr is everything before the length field
  // hdr[0] has the packet number length already in place
  // pn is the packet number
  // data is the payload (i.e., encoded frames)
  // Returns the protected packet: masked header followed by ciphertext + tag.
  encrypt(hdr, pn, data) {
    var pn_len = 1 + (hdr[0] & 0x3);
    if (pn_len + data.length < 4) {
      throw new Error('insufficient length of packet number and payload');
    }

    var aad = Buffer.alloc(hdr.length + 2 + pn_len);
    var offset = hdr.copy(aad);
    // Add a length that covers the packet number encoding and the auth tag.
    offset = aad.writeUIntBE(0x4000 | (pn_len + data.length + 16), offset, 2);
    var pn_offset = offset;
    // Use the unsigned shift: `0xffffffff >> n` is a signed shift of -1 and
    // always yields -1, which would leave the packet number untruncated.  The
    // trailing `>>> 0` keeps the 4-byte case non-negative for writeUIntBE.
    var pn_mask = 0xffffffff >>> (8 * (4 - pn_len));
    offset = aad.writeUIntBE((pn & pn_mask) >>> 0, offset, pn_len);
    log('header', aad);

    var payload = this.encipher(pn, aad, data);

    // The sample starts 4 bytes after the start of the packet number field.
    var mask = this.hpMask(payload.slice(4 - pn_len, 20 - pn_len));
    // Long headers (top bit set) mask the low 4 bits, short headers the low 5.
    aad[0] ^= mask[0] & (0x1f >> (aad[0] >> 7));
    // slice() shares memory with aad, so this masks the packet number in place.
    xor(aad.slice(pn_offset), mask.slice(1));
    log('masked header', aad);
    return Buffer.concat([aad, payload]);
  }

  // Returns 0 for a falsy `v`, otherwise v + 3.
  cidLen(v) {
    if (!v) {
      return 0;
    }
    return v + 3;
  }

  // Removes header protection from a long-header packet and returns the
  // decrypted payload.  Throws if the QUIC bit is clear, if the packet has a
  // short header, or if an Initial token length needs more than one byte.
  decrypt(data) {
    log('decrypt', data);
    // The parentheses matter: `&` binds more loosely than `!==` / `===`, so
    // without them these checks reduce to `data[0] & false` and never fire.
    if ((data[0] & 0x40) !== 0x40) {
      throw new Error('missing QUIC bit');
    }
    if ((data[0] & 0x80) === 0) {
      throw new Error('short header unsupported');
    }
    var hdr_len = 1 + 4;
    hdr_len += 1 + data[hdr_len]; // DCID
    hdr_len += 1 + data[hdr_len]; // SCID
    if ((data[0] & 0x30) === 0) { // Initial packet: token.
      if ((data[hdr_len] & 0xc0) !== 0) {
        throw new Error('multi-byte token length unsupported');
      }
      hdr_len += 1 + data[hdr_len]; // oops: this only handles single octet lengths.
    }
    // Skip the length.
    hdr_len += 1 << (data[hdr_len] >> 6);
    // Now we're at the encrypted bit.
    var mask = this.hpMask(data.slice(hdr_len + 4, hdr_len + 20));

    var octet0 = data[0] ^ (mask[0] & (0x1f >> (data[0] >> 7)));
    var pn_len = (octet0 & 3) + 1;
    var hdr = Buffer.from(data.slice(0, hdr_len + pn_len));
    hdr[0] = octet0;
    log('header', hdr);
    xor(hdr.slice(hdr_len), mask.slice(1));
    log('unmasked header', hdr);
    var pn = hdr.readUIntBE(hdr_len, pn_len);
    // Important: this doesn't recover PN based on expected value.
    // The expectation being that Initial packets won't ever need that.
    return this.decipher(pn, hdr, data.slice(hdr.length));
  }
}

// Zero-pads `body` so that header + 2-byte length + packet number + body +
// 16-byte tag comes to 1200 bytes.  Returns `body` unchanged when it already
// fills (or exceeds) that space, rather than truncating it.
function pad(hdr, body) {
  var pn_len = (hdr[0] & 3) + 1;
  var size = 1200 - hdr.length - 2 - pn_len - 16; // Assume 2 byte length.
  if (size <= body.length) {
    return body;
  }
  var padded = Buffer.allocUnsafe(size);
  console.log('pad amount', size);
  body.copy(padded);
  padded.fill(0, body.length);
  log('padded', padded);
  return padded;
}

// Encrypts one packet for `role`, decrypts it again and throws if the result
// differs from the (possibly padded) input.  cid, hdr and body are hex strings.
function test(role, cid, hdr, pn, body) {
  cid = Buffer.from(cid, 'hex');
  log('connection ID', cid);
  hdr = Buffer.from(hdr, 'hex');
  log('header', hdr);
  console.log('packet number = ' + pn);
  body = Buffer.from(body, 'hex');
  log('body', body);

  // Only client Initial packets (long header type bits 00) get padded.
  if (role === 'client' && (hdr[0] & 0x30) === 0) {
    body = pad(hdr, body);
  }

  var endpoint = new InitialProtection(role + ' in', cid);
  var packet = endpoint.encrypt(hdr, pn, body);
  log('encrypted packet', packet);

  var content = endpoint.decrypt(packet);
  log('decrypted content', content);
  if (content.compare(body) !== 0) {
    throw new Error('decrypted result not the same as the original');
  }
}

// Prefixes a hex-encoded connection ID with its byte length as two hex digits.
function hex_cid(cid) {
  // padStart keeps the length at exactly one byte for IDs of 16 bytes or more,
  // where a fixed '0' prefix would produce three hex digits.
  return (cid.length / 2).toString(16).padStart(2, '0') + cid;
}

// Verify that the retry keys are correct.
function derive_retry() {
  // Secret that the Retry key and nonce constants are checked against.
  const retrySecret = Buffer.from(
    'd9c9943e6101fd200021506bcc02814c73030f25c79d71ce876eca876e6fca8e', 'hex');
  const kdf = new QHKDF(new HMAC(SHA256), retrySecret);

  // Each row: HKDF label, output length, log tag, expected constant.
  const checks = [
    ["quic key", 16, 'retry key', RETRY_KEY],
    ["quic iv", 12, 'retry nonce', RETRY_NONCE],
  ];
  for (const [label, size, tag, expected] of checks) {
    const derived = kdf.expand_label(label, size);
    log(tag, derived);
    assert.deepStrictEqual(derived, expected);
  }
}

// Logs a Retry packet for the given hex connection IDs: a header with the
// fixed token 'token', followed by the AES-GCM tag computed over the
// length-prefixed original DCID plus that header.
function retry(dcid, scid, odcid) {
  const odcidPrefix = Buffer.from(hex_cid(odcid), 'hex');
  const fixedPart = 'ff' + version + hex_cid(dcid) + hex_cid(scid);
  const header = Buffer.concat([
    Buffer.from(fixedPart, 'hex'),
    Buffer.from('token', 'ascii'),
  ]);
  log('retry header', header);

  const pseudoPacket = Buffer.concat([odcidPrefix, header]);
  log('retry aad', pseudoPacket);

  // Nothing is encrypted; only the authentication tag over the AAD is wanted.
  const sealer = crypto.createCipheriv(AES_GCM, RETRY_KEY, RETRY_NONCE);
  sealer.setAAD(pseudoPacket);
  sealer.update('');
  sealer.final();
  const tag = sealer.getAuthTag();
  log('retry', Buffer.concat([header, tag]));
}

// A simple ChaCha20-Poly1305 packet.
// Protects one short-header packet with fixed ChaCha20-Poly1305 keys and logs
// every intermediate value.
//   pn:      packet number; its low 24 bits are encoded in the header, the full
//            value goes into the nonce.
//   payload: Buffer with the plaintext payload.
function chacha20(pn, payload) {
  log('chacha20poly1305 pn=' + pn.toString(), payload);
  const header = Buffer.alloc(4);
  header.writeUIntBE(0x42, 0, 1);
  header.writeUIntBE(pn & 0xffffff, 1, 3);
  log('unprotected header', header);
  const key = Buffer.from('c6d98ff3441c3fe1b2182094f69caa2e' +
                          'd4b716b65488960a7a984979fb23e1c8', 'hex');
  const iv = Buffer.from('e0459b3474bdd0e44a41c144', 'hex');
  const nonce = applyNonce(iv, pn);
  log('nonce', nonce);
  const aead = crypto.createCipheriv('ChaCha20-Poly1305', key, nonce, { authTagLength: 16 });
  aead.setAAD(header);
  const e = aead.update(payload);
  aead.final();
  const ct = Buffer.concat([e, aead.getAuthTag()]);
  log('ciphertext', ct);

  // With a 3-byte packet number the 16-byte sample starts 1 byte into the
  // ciphertext.
  const sample = ct.slice(1, 17);
  log('sample', sample);
  const hp = Buffer.from('25a282b9e82f06f21f488917a4fc8f1b' +
                         '73573685608597d0efcb076b0ab7a7a4', 'hex');
  const chacha = crypto.createCipheriv('ChaCha20', hp, sample);
  const mask = chacha.update(Buffer.alloc(5));
  log('mask', mask);
  // Mask the low 5 bits of the first byte, then the packet number bytes;
  // slice() shares memory with header, so the XOR lands in place.
  header[0] ^= mask[0] & 0x1f;
  xor(header.slice(1), mask.slice(1));
  log('header', header);
  log('protected packet', Buffer.concat([header, ct]));
}

var cid = '8394c8f03e515708';

var ci_hdr = 'c3' + version + hex_cid(cid) + '0000';
// This is a client Initial.
338 | var crypto_frame = '060040f1' + 339 | '010000ed0303ebf8fa56f12939b9584a3896472ec40bb863cfd3e86804fe3a47' + 340 | 'f06a2b69484c00000413011302010000c000000010000e00000b6578616d706c' + 341 | '652e636f6dff01000100000a00080006001d0017001800100007000504616c70' + 342 | '6e000500050100000000003300260024001d00209370b2c9caa47fbabaf4559f' + 343 | 'edba753de171fa71f50f1ce15d43e994ec74d748002b0003020304000d001000' + 344 | '0e0403050306030203080408050806002d00020101001c000240010039003204' + 345 | '08ffffffffffffffff05048000ffff07048000ffff0801100104800075300901' + 346 | '100f088394c8f03e51570806048000ffff'; 347 | 348 | test('client', cid, ci_hdr, 2, crypto_frame); 349 | 350 | // This should be a valid server Initial. 351 | var frames = '02000000000600405a' + 352 | '020000560303eefce7f7b37ba1d163' + 353 | '2e96677825ddf73988cfc79825df566dc5430b9a04' + 354 | '5a1200130100002e00330024001d00209d3c940d89' + 355 | '690b84d08a60993c144eca684d1081287c834d5311' + 356 | 'bcf32bb9da1a002b00020304'; 357 | var scid = 'f067a5502a4262b5'; 358 | var si_hdr = 'c1' + version + '00' + hex_cid(scid) + '00'; 359 | test('server', cid, si_hdr, 1, frames); 360 | 361 | derive_retry(); 362 | retry('', scid, cid); 363 | chacha20(654360564, Buffer.from('01', 'hex')); 364 | -------------------------------------------------------------------------------- /rfc8999.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Version-Independent Properties of QUIC" 3 | abbrev: QUIC Invariants 4 | number: 8999 5 | docName: draft-ietf-quic-invariants-13 6 | date: 2021-05 7 | category: std 8 | consensus: true 9 | ipr: trust200902 10 | area: Transport 11 | workgroup: QUIC 12 | keyword: 13 | - crypto 14 | - next generation 15 | - protocol 16 | - secure 17 | - transport 18 | - UDP 19 | 20 | stand_alone: yes 21 | pi: [toc, sortrefs, symrefs, docmapping] 22 | 23 | author: 24 | - 25 | ins: M. 
Thomson 26 | name: Martin Thomson 27 | org: Mozilla 28 | email: mt@lowentropy.net 29 | 30 | informative: 31 | 32 | QUIC-TRANSPORT: 33 | title: "QUIC: A UDP-Based Multiplexed and Secure Transport" 34 | date: 2021-05 35 | seriesinfo: 36 | RFC: 9000 37 | DOI: 10.17487/RFC9000 38 | author: 39 | - 40 | ins: J. Iyengar 41 | name: Jana Iyengar 42 | org: Google 43 | role: editor 44 | - 45 | ins: M. Thomson 46 | name: Martin Thomson 47 | org: Mozilla 48 | role: editor 49 | 50 | QUIC-TLS: 51 | title: "Using TLS to Secure QUIC" 52 | date: 2021-05 53 | seriesinfo: 54 | RFC: 9001 55 | DOI: 10.17487/RFC9001 56 | author: 57 | - 58 | ins: M. Thomson 59 | name: Martin Thomson 60 | org: Mozilla 61 | role: editor 62 | - 63 | ins: S. Turner 64 | name: Sean Turner 65 | org: sn3rd 66 | role: editor 67 | 68 | 69 | --- abstract 70 | 71 | This document defines the properties of the QUIC transport protocol that are 72 | common to all versions of the protocol. 73 | 74 | 75 | --- middle 76 | 77 | # An Extremely Abstract Description of QUIC 78 | 79 | QUIC is a connection-oriented protocol between two endpoints. Those endpoints 80 | exchange UDP datagrams. These UDP datagrams contain QUIC packets. QUIC 81 | endpoints use QUIC packets to establish a QUIC connection, which is shared 82 | protocol state between those endpoints. 83 | 84 | 85 | # Fixed Properties of All QUIC Versions 86 | 87 | In addition to providing secure, multiplexed transport, QUIC {{QUIC-TRANSPORT}} 88 | allows for the option to negotiate a version. This allows the protocol to 89 | change over time in response to new requirements. Many characteristics of the 90 | protocol could change between versions. 91 | 92 | This document describes the subset of QUIC that is intended to remain stable as 93 | new versions are developed and deployed. All of these invariants are 94 | independent of the IP version. 95 | 96 | The primary goal of this document is to ensure that it is possible to deploy new 97 | versions of QUIC. 
By documenting the properties that cannot change, this 98 | document aims to preserve the ability for QUIC endpoints to negotiate changes to 99 | any other aspect of the protocol. As a consequence, this also guarantees a 100 | minimal amount of information that is made available to entities other than 101 | endpoints. Unless specifically prohibited in this document, any aspect of the 102 | protocol can change between different versions. 103 | 104 | {{bad-assumptions}} contains a non-exhaustive list of some incorrect assumptions 105 | that might be made based on knowledge of QUIC version 1; these do not apply to 106 | every version of QUIC. 107 | 108 | 109 | # Conventions and Definitions 110 | 111 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", 112 | "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this 113 | document are to be interpreted as described in BCP 14 {{!RFC2119}} {{!RFC8174}} 114 | when, and only when, they appear in all capitals, as shown here. 115 | 116 | This document defines requirements on future QUIC versions, even where normative 117 | language is not used. 118 | 119 | This document uses terms and notational conventions from {{QUIC-TRANSPORT}}. 120 | 121 | 122 | # Notational Conventions 123 | 124 | The format of packets is described using the notation defined in this section. 125 | This notation is the same as that used in {{QUIC-TRANSPORT}}. 126 | 127 | Complex fields are named and then followed by a list of fields surrounded by a 128 | pair of matching braces. Each field in this list is separated by commas. 129 | 130 | Individual fields include length information, plus indications about fixed 131 | value, optionality, or repetitions. 
Individual fields use the following 132 | notational conventions, with all lengths in bits: 133 | 134 | x (A): 135 | : Indicates that x is A bits long 136 | 137 | x (A..B): 138 | : Indicates that x can be any length from A to B; A can be omitted to indicate 139 | a minimum of zero bits, and B can be omitted to indicate no set upper limit; 140 | values in this format always end on a byte boundary 141 | 142 | x (L) = C: 143 | : Indicates that x has a fixed value of C; the length of x is described by 144 | L, which can use any of the length forms above 145 | 146 | x (L) ...: 147 | : Indicates that x is repeated zero or more times and that each instance has a 148 | length of L 149 | 150 | This document uses network byte order (that is, big endian) values. Fields 151 | are placed starting from the high-order bits of each byte. 152 | 153 | {{fig-ex-format}} shows an example structure: 154 | 155 | ~~~ 156 | Example Structure { 157 | One-bit Field (1), 158 | 7-bit Field with Fixed Value (7) = 61, 159 | Arbitrary-Length Field (..), 160 | Variable-Length Field (8..24), 161 | Repeated Field (8) ..., 162 | } 163 | ~~~ 164 | {: #fig-ex-format title="Example Format"} 165 | 166 | 167 | # QUIC Packets 168 | 169 | QUIC endpoints exchange UDP datagrams that contain one or more QUIC packets. 170 | This section describes the invariant characteristics of a QUIC packet. A 171 | version of QUIC could permit multiple QUIC packets in a single UDP datagram, but 172 | the invariant properties only describe the first packet in a datagram. 173 | 174 | QUIC defines two types of packet headers: long and short. Packets with a long 175 | header are identified by the most significant bit of the first byte being set; 176 | packets with a short header have that bit cleared. 177 | 178 | QUIC packets might be integrity protected, including the header. However, QUIC 179 | Version Negotiation packets are not integrity protected; see {{vn}}. 
180 | 181 | Aside from the values described here, the payload of QUIC packets is 182 | version specific and of arbitrary length. 183 | 184 | 185 | ## Long Header 186 | 187 | Long headers take the form described in {{fig-long}}. 188 | 189 | ~~~ 190 | Long Header Packet { 191 | Header Form (1) = 1, 192 | Version-Specific Bits (7), 193 | Version (32), 194 | Destination Connection ID Length (8), 195 | Destination Connection ID (0..2040), 196 | Source Connection ID Length (8), 197 | Source Connection ID (0..2040), 198 | Version-Specific Data (..), 199 | } 200 | ~~~ 201 | {: #fig-long title="QUIC Long Header"} 202 | 203 | A QUIC packet with a long header has the high bit of the first byte set to 1. 204 | All other bits in that byte are version specific. 205 | 206 | The next four bytes include a 32-bit Version field. Versions are described in 207 | {{version}}. 208 | 209 | The next byte contains the length in bytes of the Destination Connection ID 210 | field that follows it. This length is encoded as an 8-bit unsigned integer. 211 | The Destination Connection ID field follows the Destination Connection ID Length 212 | field and is between 0 and 255 bytes in length. Connection IDs are described in 213 | {{connection-id}}. 214 | 215 | The next byte contains the length in bytes of the Source Connection ID field 216 | that follows it. This length is encoded as an 8-bit unsigned integer. The 217 | Source Connection ID field follows the Source Connection ID Length field and is 218 | between 0 and 255 bytes in length. 219 | 220 | The remainder of the packet contains version-specific content. 221 | 222 | 223 | ## Short Header 224 | 225 | Short headers take the form described in {{fig-short}}. 
226 | 227 | ~~~~~ 228 | Short Header Packet { 229 | Header Form (1) = 0, 230 | Version-Specific Bits (7), 231 | Destination Connection ID (..), 232 | Version-Specific Data (..), 233 | } 234 | ~~~~~ 235 | {: #fig-short title="QUIC Short Header"} 236 | 237 | A QUIC packet with a short header has the high bit of the first byte set to 0. 238 | 239 | A QUIC packet with a short header includes a Destination Connection ID 240 | immediately following the first byte. The short header does not include the 241 | Destination Connection ID Length, Source Connection ID Length, Source Connection 242 | ID, or Version fields. The length of the Destination Connection ID is not 243 | encoded in packets with a short header and is not constrained by this 244 | specification. 245 | 246 | The remainder of the packet has version-specific semantics. 247 | 248 | 249 | ## Connection ID 250 | 251 | A connection ID is an opaque field of arbitrary length. 252 | 253 | The primary function of a connection ID is to ensure that changes in addressing 254 | at lower protocol layers (UDP, IP, and below) do not cause packets for a QUIC 255 | connection to be delivered to the wrong QUIC endpoint. The connection ID 256 | is used by endpoints and the intermediaries that support them to ensure that 257 | each QUIC packet can be delivered to the correct instance of an endpoint. At 258 | the endpoint, the connection ID is used to identify the QUIC connection for 259 | which the packet is intended. 260 | 261 | The connection ID is chosen by each endpoint using version-specific methods. 262 | Packets for the same QUIC connection might use different connection ID values. 263 | 264 | 265 | ## Version 266 | 267 | The Version field contains a 4-byte identifier. This value can be used by 268 | endpoints to identify a QUIC version. A Version field with a value of 269 | 0x00000000 is reserved for version negotiation; see {{vn}}. All other values 270 | are potentially valid. 
271 | 272 | The properties described in this document apply to all versions of QUIC. A 273 | protocol that does not conform to the properties described in this document is 274 | not QUIC. Future documents might describe additional properties that apply to 275 | a specific QUIC version or to a range of QUIC versions. 276 | 277 | 278 | # Version Negotiation {#vn} 279 | 280 | A QUIC endpoint that receives a packet with a long header and a version it 281 | either does not understand or does not support might send a Version Negotiation 282 | packet in response. Packets with a short header do not trigger version 283 | negotiation. 284 | 285 | A Version Negotiation packet sets the high bit of the first byte, and thus it 286 | conforms with the format of a packet with a long header as defined in 287 | {{long-header}}. A Version Negotiation packet is identifiable as such by the 288 | Version field, which is set to 0x00000000. 289 | 290 | ~~~ 291 | Version Negotiation Packet { 292 | Header Form (1) = 1, 293 | Unused (7), 294 | Version (32) = 0, 295 | Destination Connection ID Length (8), 296 | Destination Connection ID (0..2040), 297 | Source Connection ID Length (8), 298 | Source Connection ID (0..2040), 299 | Supported Version (32) ..., 300 | } 301 | ~~~ 302 | {: #version-negotiation-format title="Version Negotiation Packet"} 303 | 304 | Only the most significant bit of the first byte of a Version Negotiation packet 305 | has any defined value. The remaining 7 bits, labeled "Unused", can be set to 306 | any value when sending and MUST be ignored on receipt. 307 | 308 | After the Source Connection ID field, the Version Negotiation packet contains a 309 | list of Supported Version fields, each identifying a version that the endpoint 310 | sending the packet supports. A Version Negotiation packet contains no other 311 | fields. An endpoint MUST ignore a packet that contains no Supported Version 312 | fields or contains a truncated Supported Version value. 
313 | 314 | Version Negotiation packets do not use integrity or confidentiality protection. 315 | Specific QUIC versions might include protocol elements that allow endpoints to 316 | detect modification or corruption in the set of supported versions. 317 | 318 | An endpoint MUST include the value from the Source Connection ID field of the 319 | packet it receives in the Destination Connection ID field. The value for the 320 | Source Connection ID field MUST be copied from the Destination Connection ID 321 | field of the received packet, which is initially randomly selected by a client. 322 | Echoing both connection IDs gives clients some assurance that the server 323 | received the packet and that the Version Negotiation packet was not generated by 324 | an attacker that is unable to observe packets. 325 | 326 | An endpoint that receives a Version Negotiation packet might change the version 327 | that it decides to use for subsequent packets. The conditions under which an 328 | endpoint changes its QUIC version will depend on the version of QUIC that it 329 | chooses. 330 | 331 | See {{QUIC-TRANSPORT}} for a more thorough description of how an endpoint that 332 | supports QUIC version 1 generates and consumes a Version Negotiation packet. 333 | 334 | 335 | # Security and Privacy Considerations 336 | 337 | It is possible that middleboxes could observe traits of a specific version of 338 | QUIC and assume that when other versions of QUIC exhibit similar traits the same 339 | underlying semantic is being expressed. There are potentially many such traits; 340 | see {{bad-assumptions}}. Some effort has been made to either eliminate or 341 | obscure some observable traits in QUIC version 1, but many of these remain. 342 | Other QUIC versions might make different design decisions and so exhibit 343 | different traits. 
344 | 345 | The QUIC version number does not appear in all QUIC packets, which means that 346 | reliably extracting information from a flow based on version-specific traits 347 | requires that middleboxes retain state for every connection ID they see. 348 | 349 | The Version Negotiation packet described in this document is not 350 | integrity protected; it only has modest protection against insertion by 351 | attackers. An endpoint MUST authenticate the semantic content of a Version 352 | Negotiation packet if it attempts a different QUIC version as a result. 353 | 354 | 355 | --- back 356 | 357 | # Incorrect Assumptions {#bad-assumptions} 358 | 359 | There are several traits of QUIC version 1 {{QUIC-TRANSPORT}} that are not 360 | protected from observation but are nonetheless considered to be changeable when 361 | a new version is deployed. 362 | 363 | This section lists a sampling of incorrect assumptions that might be made about 364 | QUIC based on knowledge of QUIC version 1. Some of these statements are not 365 | even true for QUIC version 1. This is not an exhaustive list; it is intended to 366 | be illustrative only. 367 | 368 | **Any and all of the following statements can be false for a given QUIC 369 | version:** 370 | 371 | * QUIC uses TLS {{QUIC-TLS}} and some TLS messages are visible on the wire. 372 | 373 | * QUIC long headers are only exchanged during connection establishment. 374 | 375 | * Every flow on a given 5-tuple will include a connection establishment phase. 376 | 377 | * The first packets exchanged on a flow use the long header. 378 | 379 | * The last packet before a long period of quiescence might be assumed 380 | to contain only an acknowledgment. 381 | 382 | * QUIC uses an Authenticated Encryption with Associated Data (AEAD) function 383 | (AEAD_AES_128_GCM; see {{?RFC5116}}) to protect the packets it exchanges 384 | during connection establishment. 385 | 386 | * QUIC packet numbers are encrypted and appear as the first encrypted bytes. 
387 | 388 | * QUIC packet numbers increase by one for every packet sent. 389 | 390 | * QUIC has a minimum size for the first handshake packet sent by a client. 391 | 392 | * QUIC stipulates that a client speak first. 393 | 394 | * QUIC packets always have the second bit of the first byte (0x40) set. 395 | 396 | * A QUIC Version Negotiation packet is only sent by a server. 397 | 398 | * A QUIC connection ID changes infrequently. 399 | 400 | * QUIC endpoints change the version they speak if they are sent a Version 401 | Negotiation packet. 402 | 403 | * The Version field in a QUIC long header is the same in both directions. 404 | 405 | * A QUIC packet with a particular value in the Version field means that the 406 | corresponding version of QUIC is in use. 407 | 408 | * Only one connection at a time is established between any pair of QUIC 409 | endpoints. 410 | -------------------------------------------------------------------------------- /rfc9002.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: QUIC Loss Detection and Congestion Control 3 | abbrev: QUIC Loss Detection 4 | number: 9002 5 | docName: draft-ietf-quic-recovery-34 6 | date: 2021-05 7 | category: std 8 | consensus: true 9 | ipr: trust200902 10 | area: Transport 11 | workgroup: QUIC 12 | keyword: 13 | - bbr 14 | - delay-sensitive congestion control 15 | - fec 16 | - loss-tolerant congestion control 17 | - next generation 18 | 19 | stand_alone: yes 20 | pi: [toc, sortrefs, symrefs, docmapping] 21 | 22 | author: 23 | - 24 | ins: J. Iyengar 25 | name: Jana Iyengar 26 | org: Fastly 27 | email: jri.ietf@gmail.com 28 | role: editor 29 | - 30 | ins: I. 
Swett 31 | name: Ian Swett 32 | org: Google 33 | email: ianswett@google.com 34 | role: editor 35 | 36 | normative: 37 | 38 | QUIC-TRANSPORT: 39 | title: "QUIC: A UDP-Based Multiplexed and Secure Transport" 40 | date: 2021-05 41 | seriesinfo: 42 | RFC: 9000 43 | DOI: 10.17487/RFC9000 44 | author: 45 | - 46 | ins: J. Iyengar 47 | name: Jana Iyengar 48 | org: Fastly 49 | role: editor 50 | - 51 | ins: M. Thomson 52 | name: Martin Thomson 53 | org: Mozilla 54 | role: editor 55 | 56 | QUIC-TLS: 57 | title: "Using TLS to Secure QUIC" 58 | date: 2021-05 59 | seriesinfo: 60 | RFC: 9001 61 | DOI: 10.17487/RFC9001 62 | author: 63 | - 64 | ins: M. Thomson 65 | name: Martin Thomson 66 | org: Mozilla 67 | role: editor 68 | - 69 | ins: S. Turner 70 | name: Sean Turner 71 | org: sn3rd 72 | role: editor 73 | 74 | RFC8085: 75 | 76 | informative: 77 | 78 | FACK: 79 | title: "Forward acknowledgement: Refining TCP Congestion Control" 80 | author: 81 | - 82 | initials: M. 83 | surname: Mathis 84 | - 85 | initials: J. 86 | surname: Mahdavi 87 | date: 1996-08 88 | refcontent: ACM SIGCOMM Computer Communication Review 89 | seriesinfo: 90 | DOI: 10.1145/248157.248181 91 | 92 | RETRANSMISSION: 93 | title: "Improving Round-Trip Time Estimates in Reliable Transport Protocols" 94 | author: 95 | - 96 | initials: P. 97 | surname: Karn 98 | - 99 | initials: C. 100 | surname: Partridge 101 | date: 1991-11 102 | refcontent: ACM Transactions on Computer Systems 103 | seriesinfo: 104 | DOI: 10.1145/118544.118549 105 | 106 | RFC3465: 107 | 108 | --- abstract 109 | 110 | This document describes loss detection and congestion control mechanisms for 111 | QUIC. 112 | 113 | 114 | --- middle 115 | 116 | # Introduction 117 | 118 | 119 | QUIC is a secure, general-purpose transport protocol, described in 120 | {{QUIC-TRANSPORT}}. This document describes loss detection and congestion 121 | control mechanisms for QUIC. 
122 | 123 | # Conventions and Definitions 124 | 125 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", 126 | "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this 127 | document are to be interpreted as described in BCP 14 {{!RFC2119}} {{!RFC8174}} 128 | when, and only when, they appear in all capitals, as shown here. 129 | 130 | Definitions of terms that are used in this document: 131 | 132 | Ack-eliciting frames: 133 | 134 | : All frames other than ACK, PADDING, and CONNECTION_CLOSE are considered 135 | ack-eliciting. 136 | 137 | Ack-eliciting packets: 138 | 139 | : Packets that contain ack-eliciting frames elicit an ACK from the receiver 140 | within the maximum acknowledgment delay and are called ack-eliciting packets. 141 | 142 | In-flight packets: 143 | 144 | : Packets are considered in flight when they are ack-eliciting or contain a 145 | PADDING frame, and they have been sent but are not acknowledged, declared 146 | lost, or discarded along with old keys. 147 | 148 | # Design of the QUIC Transmission Machinery 149 | 150 | All transmissions in QUIC are sent with a packet-level header, which indicates 151 | the encryption level and includes a packet sequence number (referred to below as 152 | a packet number). The encryption level indicates the packet number space, as 153 | described in {{Section 12.3 of QUIC-TRANSPORT}}. Packet numbers never repeat 154 | within a packet number space for the lifetime of a connection. Packet numbers 155 | are sent in monotonically increasing order within a space, preventing ambiguity. 156 | It is permitted for some packet numbers to never be used, leaving intentional 157 | gaps. 158 | 159 | This design obviates the need for disambiguating between transmissions and 160 | retransmissions; this eliminates significant complexity from QUIC's 161 | interpretation of TCP loss detection mechanisms. 162 | 163 | QUIC packets can contain multiple frames of different types. 
The recovery 164 | mechanisms ensure that data and frames that need reliable delivery are 165 | acknowledged or declared lost and sent in new packets as necessary. The types 166 | of frames contained in a packet affect recovery and congestion control logic: 167 | 168 | * All packets are acknowledged, though packets that contain no 169 | ack-eliciting frames are only acknowledged along with ack-eliciting 170 | packets. 171 | 172 | * Long header packets that contain CRYPTO frames are critical to the 173 | performance of the QUIC handshake and use shorter timers for 174 | acknowledgment. 175 | 176 | * Packets containing frames besides ACK or CONNECTION_CLOSE frames count toward 177 | congestion control limits and are considered to be in flight. 178 | 179 | * PADDING frames cause packets to contribute toward bytes in flight without 180 | directly causing an acknowledgment to be sent. 181 | 182 | # Relevant Differences Between QUIC and TCP 183 | 184 | Readers familiar with TCP's loss detection and congestion control will find 185 | algorithms here that parallel well-known TCP ones. However, protocol differences 186 | between QUIC and TCP contribute to algorithmic differences. These protocol 187 | differences are briefly described below. 188 | 189 | ## Separate Packet Number Spaces 190 | 191 | QUIC uses separate packet number spaces for each encryption level, 192 | except 0-RTT and all generations of 1-RTT keys use the same packet 193 | number space. Separate packet number spaces ensure that the 194 | acknowledgment of packets sent with one level of encryption will not 195 | cause spurious retransmission of packets sent with a different 196 | encryption level. Congestion control and round-trip time (RTT) 197 | measurement are unified across packet number spaces.
198 | 199 | ## Monotonically Increasing Packet Numbers 200 | 201 | TCP conflates transmission order at the sender with delivery order at the 202 | receiver, resulting in the retransmission ambiguity problem 203 | {{RETRANSMISSION}}. QUIC separates transmission order from delivery order: 204 | packet numbers indicate transmission order, and delivery order is determined by 205 | the stream offsets in STREAM frames. 206 | 207 | QUIC's packet number is strictly increasing within a packet number space 208 | and directly encodes transmission order. A higher packet number signifies 209 | that the packet was sent later, and a lower packet number signifies that 210 | the packet was sent earlier. When a packet containing ack-eliciting 211 | frames is detected lost, QUIC includes necessary frames in a new packet 212 | with a new packet number, removing ambiguity about which packet is 213 | acknowledged when an ACK is received. Consequently, more accurate RTT 214 | measurements can be made, spurious retransmissions are trivially detected, and 215 | mechanisms such as Fast Retransmit can be applied universally, based only on 216 | packet number. 217 | 218 | This design point significantly simplifies loss detection mechanisms for QUIC. 219 | Most TCP mechanisms implicitly attempt to infer transmission ordering based on 220 | TCP sequence numbers -- a nontrivial task, especially when TCP timestamps are 221 | not available. 222 | 223 | ## Clearer Loss Epoch 224 | 225 | QUIC starts a loss epoch when a packet is lost. The loss epoch ends when any 226 | packet sent after the start of the epoch is acknowledged. TCP waits for the gap 227 | in the sequence number space to be filled, and so if a segment is lost multiple 228 | times in a row, the loss epoch may not end for several round trips. 
Because both 229 | should reduce their congestion windows only once per epoch, QUIC will do it once 230 | for every round trip that experiences loss, while TCP may only do it once across 231 | multiple round trips. 232 | 233 | ## No Reneging 234 | 235 | QUIC ACK frames contain information similar to that in TCP Selective 236 | Acknowledgments (SACKs) {{?RFC2018}}. However, QUIC does not allow a packet 237 | acknowledgment to be reneged, greatly simplifying implementations on both sides 238 | and reducing memory pressure on the sender. 239 | 240 | ## More ACK Ranges 241 | 242 | QUIC supports many ACK ranges, as opposed to TCP's three SACK ranges. In 243 | high-loss environments, this speeds recovery, reduces spurious retransmits, and 244 | ensures forward progress without relying on timeouts. 245 | 246 | ## Explicit Correction For Delayed Acknowledgments 247 | 248 | QUIC endpoints measure the delay incurred between when a packet is received and 249 | when the corresponding acknowledgment is sent, allowing a peer to maintain a 250 | more accurate RTT estimate; see {{Section 13.2 of QUIC-TRANSPORT}}. 251 | 252 | ## Probe Timeout Replaces RTO and TLP 253 | 254 | QUIC uses a probe timeout (PTO; see {{pto}}), with a timer based on TCP's 255 | retransmission timeout (RTO) computation; see {{?RFC6298}}. QUIC's PTO includes 256 | the peer's maximum expected acknowledgment delay instead of using a fixed 257 | minimum timeout. 258 | 259 | Similar to the RACK-TLP loss detection algorithm for TCP {{?RFC8985}}, QUIC does 260 | not collapse the congestion window when the PTO expires, since a single packet 261 | loss at the tail does not indicate persistent congestion. Instead, QUIC 262 | collapses the congestion window when persistent congestion is declared; see 263 | {{persistent-congestion}}. In doing this, QUIC avoids unnecessary congestion 264 | window reductions, obviating the need for correcting mechanisms such as Forward 265 | RTO-Recovery (F-RTO) {{?RFC5682}}. 
Since QUIC does not collapse the congestion 266 | window on a PTO expiration, a QUIC sender is not limited from sending more 267 | in-flight packets after a PTO expiration if it still has available congestion 268 | window. This occurs when a sender is application limited and the PTO timer 269 | expires. This is more aggressive than TCP's RTO mechanism when application 270 | limited, but identical when not application limited. 271 | 272 | QUIC allows probe packets to temporarily exceed the congestion window whenever 273 | the timer expires. 274 | 275 | ## The Minimum Congestion Window Is Two Packets 276 | 277 | TCP uses a minimum congestion window of one packet. However, loss of that single 278 | packet means that the sender needs to wait for a PTO to recover ({{pto}}), which 279 | can be much longer than an RTT. Sending a single ack-eliciting packet also 280 | increases the chances of incurring additional latency when a receiver delays its 281 | acknowledgment. 282 | 283 | QUIC therefore recommends that the minimum congestion window be two 284 | packets. While this increases network load, it is considered safe since the 285 | sender will still reduce its sending rate exponentially under persistent 286 | congestion ({{pto}}). 287 | 288 | ## Handshake Packets Are Not Special 289 | 290 | TCP treats the loss of a SYN or SYN-ACK packet as persistent congestion and 291 | reduces the congestion window to one packet; see {{?RFC5681}}. QUIC treats loss 292 | of a packet containing handshake data the same as other losses. 293 | 294 | # Estimating the Round-Trip Time {#compute-rtt} 295 | 296 | At a high level, an endpoint measures the time from when a packet was sent to 297 | when it is acknowledged as an RTT sample. The endpoint uses RTT samples and 298 | peer-reported host delays (see {{Section 13.2 of QUIC-TRANSPORT}}) to generate a 299 | statistical description of the network path's RTT.
An endpoint computes the 300 | following three values for each path: the minimum value over a period of time 301 | (min_rtt), an exponentially weighted moving average (smoothed_rtt), and the mean 302 | deviation (referred to as "variation" in the rest of this document) in the 303 | observed RTT samples (rttvar). 304 | 305 | ## Generating RTT Samples {#latest-rtt} 306 | 307 | An endpoint generates an RTT sample on receiving an ACK frame that meets the 308 | following two conditions: 309 | 310 | - the largest acknowledged packet number is newly acknowledged, and 311 | 312 | - at least one of the newly acknowledged packets was ack-eliciting. 313 | 314 | The RTT sample, latest_rtt, is generated as the time elapsed since the largest 315 | acknowledged packet was sent: 316 | 317 | ~~~pseudocode 318 | latest_rtt = ack_time - send_time_of_largest_acked 319 | ~~~ 320 | 321 | An RTT sample is generated using only the largest acknowledged packet in the 322 | received ACK frame. This is because a peer reports acknowledgment delays for 323 | only the largest acknowledged packet in an ACK frame. While the reported 324 | acknowledgment delay is not used by the RTT sample measurement, it is used to 325 | adjust the RTT sample in subsequent computations of smoothed_rtt and rttvar 326 | ({{smoothed-rtt}}). 327 | 328 | To avoid generating multiple RTT samples for a single packet, an ACK frame 329 | SHOULD NOT be used to update RTT estimates if it does not newly acknowledge the 330 | largest acknowledged packet. 331 | 332 | An RTT sample MUST NOT be generated on receiving an ACK frame that does not 333 | newly acknowledge at least one ack-eliciting packet. A peer usually does not 334 | send an ACK frame when only non-ack-eliciting packets are received. Therefore, 335 | an ACK frame that contains acknowledgments for only non-ack-eliciting packets 336 | could include an arbitrarily large ACK Delay value. 
Ignoring 337 | such ACK frames avoids complications in subsequent smoothed_rtt and rttvar 338 | computations. 339 | 340 | A sender might generate multiple RTT samples per RTT when multiple ACK frames 341 | are received within an RTT. As suggested in {{?RFC6298}}, doing so might result 342 | in inadequate history in smoothed_rtt and rttvar. Ensuring that RTT estimates 343 | retain sufficient history is an open research question. 344 | 345 | ## Estimating min_rtt {#min-rtt} 346 | 347 | min_rtt is the sender's estimate of the minimum RTT observed for a given network 348 | path over a period of time. In this document, min_rtt is used by loss detection 349 | to reject implausibly small RTT samples. 350 | 351 | min_rtt MUST be set to the latest_rtt on the first RTT sample. min_rtt MUST be 352 | set to the lesser of min_rtt and latest_rtt ({{latest-rtt}}) on all other 353 | samples. 354 | 355 | An endpoint uses only locally observed times in computing the min_rtt and does 356 | not adjust for acknowledgment delays reported by the peer. Doing so allows the 357 | endpoint to set a lower bound for the smoothed_rtt based entirely on what it 358 | observes (see {{smoothed-rtt}}) and limits potential underestimation due to 359 | erroneously reported delays by the peer. 360 | 361 | The RTT for a network path may change over time. If a path's actual RTT 362 | decreases, the min_rtt will adapt immediately on the first low sample. If the 363 | path's actual RTT increases, however, the min_rtt will not adapt to it, allowing 364 | future RTT samples that are smaller than the new RTT to be included in 365 | smoothed_rtt. 366 | 367 | Endpoints SHOULD set the min_rtt to the newest RTT sample after persistent 368 | congestion is established. This avoids repeatedly declaring persistent 369 | congestion when the RTT increases. This also allows a connection to reset 370 | its estimate of min_rtt and smoothed_rtt after a disruptive network event; 371 | see {{smoothed-rtt}}. 
372 | 373 | Endpoints MAY reestablish the min_rtt at other times in the connection, such as 374 | when traffic volume is low and an acknowledgment is received with a low 375 | acknowledgment delay. Implementations SHOULD NOT refresh the min_rtt 376 | value too often since the actual minimum RTT of the path is not 377 | frequently observable. 378 | 379 | 380 | ## Estimating smoothed_rtt and rttvar {#smoothed-rtt} 381 | 382 | smoothed_rtt is an exponentially weighted moving average of an endpoint's RTT 383 | samples, and rttvar estimates the variation in the RTT samples using a mean 384 | variation. 385 | 386 | The calculation of smoothed_rtt uses RTT samples after adjusting them for 387 | acknowledgment delays. These delays are decoded from the ACK Delay field of 388 | ACK frames as described in {{Section 19.3 of QUIC-TRANSPORT}}. 389 | 390 | The peer might report acknowledgment delays that are larger than the peer's 391 | max_ack_delay during the handshake ({{Section 13.2.1 of QUIC-TRANSPORT}}). To 392 | account for this, the endpoint SHOULD ignore max_ack_delay until the handshake 393 | is confirmed, as defined in {{Section 4.1.2 of QUIC-TLS}}. When they occur, 394 | these large acknowledgment delays are likely to be non-repeating and limited to 395 | the handshake. The endpoint can therefore use them without limiting them to the 396 | max_ack_delay, avoiding unnecessary inflation of the RTT estimate. 397 | 398 | Note that a large acknowledgment delay can result in a substantially deflated 399 | smoothed_rtt if there is an error either in the peer's reporting of the 400 | acknowledgment delay or in the endpoint's min_rtt estimate. Therefore, prior 401 | to handshake confirmation, an endpoint MAY ignore RTT samples if adjusting 402 | the RTT sample for acknowledgment delay causes the sample to be less than the 403 | min_rtt.
404 | 405 | After the handshake is confirmed, any acknowledgment delays reported by the 406 | peer that are greater than the peer's max_ack_delay are attributed to 407 | unintentional but potentially repeating delays, such as scheduler latency at the 408 | peer or loss of previous acknowledgments. Excess delays could also be due to 409 | a noncompliant receiver. Therefore, these extra delays are considered 410 | effectively part of path delay and incorporated into the RTT estimate. 411 | 412 | Therefore, when adjusting an RTT sample using peer-reported acknowledgment 413 | delays, an endpoint: 414 | 415 | - MAY ignore the acknowledgment delay for Initial packets, since these 416 | acknowledgments are not delayed by the peer ({{Section 13.2.1 of 417 | QUIC-TRANSPORT}}); 418 | 419 | - SHOULD ignore the peer's max_ack_delay until the handshake is confirmed; 420 | 421 | - MUST use the lesser of the acknowledgment delay and the peer's max_ack_delay 422 | after the handshake is confirmed; and 423 | 424 | - MUST NOT subtract the acknowledgment delay from the RTT sample if the 425 | resulting value is smaller than the min_rtt. This limits the underestimation 426 | of the smoothed_rtt due to a misreporting peer. 427 | 428 | Additionally, an endpoint might postpone the processing of acknowledgments when 429 | the corresponding decryption keys are not immediately available. For example, a 430 | client might receive an acknowledgment for a 0-RTT packet that it cannot 431 | decrypt because 1-RTT packet protection keys are not yet available to it. In 432 | such cases, an endpoint SHOULD subtract such local delays from its RTT sample 433 | until the handshake is confirmed. 434 | 435 | Similar to {{?RFC6298}}, smoothed_rtt and rttvar are computed as follows. 436 | 437 | An endpoint initializes the RTT estimator during connection establishment and 438 | when the estimator is reset during connection migration; see {{Section 9.4 of 439 | QUIC-TRANSPORT}}. 
Before any RTT samples are available for a new path or when 440 | the estimator is reset, the estimator is initialized using the initial RTT; see 441 | {{pto-handshake}}. 442 | 443 | smoothed_rtt and rttvar are initialized as follows, where kInitialRtt contains 444 | the initial RTT value: 445 | 446 | ~~~pseudocode 447 | smoothed_rtt = kInitialRtt 448 | rttvar = kInitialRtt / 2 449 | ~~~ 450 | 451 | RTT samples for the network path are recorded in latest_rtt; see 452 | {{latest-rtt}}. On the first RTT sample after initialization, the estimator is 453 | reset using that sample. This ensures that the estimator retains no history of 454 | past samples. Packets sent on other paths do not contribute RTT samples to the 455 | current path, as described in {{Section 9.4 of QUIC-TRANSPORT}}. 456 | 457 | On the first RTT sample after initialization, smoothed_rtt and rttvar are set as 458 | follows: 459 | 460 | ~~~pseudocode 461 | smoothed_rtt = latest_rtt 462 | rttvar = latest_rtt / 2 463 | ~~~ 464 | 465 | On subsequent RTT samples, smoothed_rtt and rttvar evolve as follows: 466 | 467 | ~~~pseudocode 468 | ack_delay = decoded acknowledgment delay from ACK frame 469 | if (handshake confirmed): 470 | ack_delay = min(ack_delay, max_ack_delay) 471 | adjusted_rtt = latest_rtt 472 | if (latest_rtt >= min_rtt + ack_delay): 473 | adjusted_rtt = latest_rtt - ack_delay 474 | smoothed_rtt = 7/8 * smoothed_rtt + 1/8 * adjusted_rtt 475 | rttvar_sample = abs(smoothed_rtt - adjusted_rtt) 476 | rttvar = 3/4 * rttvar + 1/4 * rttvar_sample 477 | ~~~ 478 | 479 | # Loss Detection {#loss-detection} 480 | 481 | QUIC senders use acknowledgments to detect lost packets and a PTO to ensure 482 | acknowledgments are received; see {{pto}}. This section provides a description 483 | of these algorithms. 484 | 485 | If a packet is lost, the QUIC transport needs to recover from that loss, such 486 | as by retransmitting the data, sending an updated frame, or discarding the 487 | frame. 
For more information, see {{Section 13.3 of QUIC-TRANSPORT}}. 488 | 489 | Loss detection is separate per packet number space, unlike RTT measurement and 490 | congestion control, because RTT and congestion control are properties of the 491 | path, whereas loss detection also relies upon key availability. 492 | 493 | ## Acknowledgment-Based Detection {#ack-loss-detection} 494 | 495 | Acknowledgment-based loss detection implements the spirit of TCP's Fast 496 | Retransmit {{?RFC5681}}, Early Retransmit {{?RFC5827}}, Forward Acknowledgment 497 | {{FACK}}, SACK loss recovery {{?RFC6675}}, and RACK-TLP {{?RFC8985}}. This 498 | section provides an overview of how these algorithms are implemented in QUIC. 499 | 500 | A packet is declared lost if it meets all of the following conditions: 501 | 502 | * The packet is unacknowledged, in flight, and was sent prior to an 503 | acknowledged packet. 504 | 505 | * The packet was sent kPacketThreshold packets before an acknowledged packet 506 | ({{packet-threshold}}), or it was sent long enough in the past 507 | ({{time-threshold}}). 508 | 509 | The acknowledgment indicates that a packet sent later was delivered, and the 510 | packet and time thresholds provide some tolerance for packet reordering. 511 | 512 | Spuriously declaring packets as lost leads to unnecessary retransmissions and 513 | may result in degraded performance due to the actions of the congestion 514 | controller upon detecting loss. Implementations can detect spurious 515 | retransmissions and increase the packet or time reordering threshold to 516 | reduce future spurious retransmissions and loss events. Implementations with 517 | adaptive time thresholds MAY choose to start with smaller initial reordering 518 | thresholds to minimize recovery latency. 
519 | 520 | ### Packet Threshold {#packet-threshold} 521 | 522 | The RECOMMENDED initial value for the packet reordering threshold 523 | (kPacketThreshold) is 3, based on best practices for TCP loss detection 524 | {{?RFC5681}} {{?RFC6675}}. In order to remain similar to TCP, 525 | implementations SHOULD NOT use a packet threshold less than 3; see {{?RFC5681}}. 526 | 527 | Some networks may exhibit higher degrees of packet reordering, causing a sender 528 | to detect spurious losses. Additionally, packet reordering could be more common 529 | with QUIC than TCP because network elements that could observe and reorder TCP 530 | packets cannot do that for QUIC and also because QUIC packet numbers are 531 | encrypted. Algorithms that increase the reordering threshold after spuriously 532 | detecting losses, such as RACK {{?RFC8985}}, have proven to be useful in TCP and 533 | are expected to be at least as useful in QUIC. 534 | 535 | ### Time Threshold {#time-threshold} 536 | 537 | Once a later packet within the same packet number space has been acknowledged, 538 | an endpoint SHOULD declare an earlier packet lost if it was sent a threshold 539 | amount of time in the past. To avoid declaring packets as lost too early, this 540 | time threshold MUST be set to at least the local timer granularity, as 541 | indicated by the kGranularity constant. The time threshold is: 542 | 543 | ~~~pseudocode 544 | max(kTimeThreshold * max(smoothed_rtt, latest_rtt), kGranularity) 545 | ~~~ 546 | 547 | If packets sent prior to the largest acknowledged packet cannot yet be declared 548 | lost, then a timer SHOULD be set for the remaining time. 
549 | 550 | Using max(smoothed_rtt, latest_rtt) protects from the two following cases: 551 | 552 | * the latest RTT sample is lower than the smoothed RTT, perhaps due to 553 | reordering where the acknowledgment encountered a shorter path; 554 | 555 | * the latest RTT sample is higher than the smoothed RTT, perhaps due to a 556 | sustained increase in the actual RTT, but the smoothed RTT has not yet caught 557 | up. 558 | 559 | The RECOMMENDED time threshold (kTimeThreshold), expressed as an RTT multiplier, 560 | is 9/8. The RECOMMENDED value of the timer granularity (kGranularity) is 1 561 | millisecond. 562 | 563 | 567 | 568 | Implementations MAY experiment with absolute thresholds, thresholds from 569 | previous connections, adaptive thresholds, or the inclusion of RTT variation. 570 | Smaller thresholds reduce reordering resilience and increase spurious 571 | retransmissions, and larger thresholds increase loss detection delay. 572 | 573 | 574 | ## Probe Timeout {#pto} 575 | 576 | A Probe Timeout (PTO) triggers the sending of one or two probe datagrams when 577 | ack-eliciting packets are not acknowledged within the expected period of 578 | time or the server may not have validated the client's address. A PTO enables 579 | a connection to recover from loss of tail packets or acknowledgments. 580 | 581 | As with loss detection, the PTO is per packet number space. That is, a 582 | PTO value is computed per packet number space. 583 | 584 | A PTO timer expiration event does not indicate packet loss and MUST NOT cause 585 | prior unacknowledged packets to be marked as lost. When an acknowledgment is 586 | received that newly acknowledges packets, loss detection proceeds as dictated 587 | by the packet and time threshold mechanisms; see {{ack-loss-detection}}. 588 | 589 | The PTO algorithm used in QUIC implements the reliability functions of Tail Loss 590 | Probe {{?RFC8985}}, RTO {{?RFC5681}}, and F-RTO algorithms for TCP 591 | {{?RFC5682}}.
The timeout computation is based on TCP's RTO period {{?RFC6298}}. 592 | 593 | ### Computing PTO 594 | 595 | When an ack-eliciting packet is transmitted, the sender schedules a timer for 596 | the PTO period as follows: 597 | 598 | ~~~pseudocode 599 | PTO = smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay 600 | ~~~ 601 | 602 | The PTO period is the amount of time that a sender ought to wait for an 603 | acknowledgment of a sent packet. This time period includes the estimated 604 | network RTT (smoothed_rtt), the variation in the estimate (4*rttvar), 605 | and max_ack_delay, to account for the maximum time by which a receiver might 606 | delay sending an acknowledgment. 607 | 608 | When the PTO is armed for Initial or Handshake packet number spaces, the 609 | max_ack_delay in the PTO period computation is set to 0, since the peer is 610 | expected to not delay these packets intentionally; see {{Section 13.2.1 of 611 | QUIC-TRANSPORT}}. 612 | 613 | The PTO period MUST be at least kGranularity to avoid the timer expiring 614 | immediately. 615 | 616 | When ack-eliciting packets in multiple packet number spaces are in flight, the 617 | timer MUST be set to the earlier value of the Initial and Handshake packet 618 | number spaces. 619 | 620 | An endpoint MUST NOT set its PTO timer for the Application Data packet number 621 | space until the handshake is confirmed. Doing so prevents the endpoint from 622 | retransmitting information in packets when either the peer does not yet have the 623 | keys to process them or the endpoint does not yet have the keys to process their 624 | acknowledgments. For example, this can happen when a client sends 0-RTT packets 625 | to the server; it does so without knowing whether the server will be able to 626 | decrypt them. Similarly, this can happen when a server sends 1-RTT packets 627 | before confirming that the client has verified the server's certificate and can 628 | therefore read these 1-RTT packets. 
629 | 630 | A sender SHOULD restart its PTO timer every time an ack-eliciting packet is 631 | sent or acknowledged, or when Initial or Handshake keys are discarded 632 | ({{Section 4.9 of QUIC-TLS}}). This ensures the PTO is always set based on the 633 | latest estimate of the RTT and for the correct packet across packet 634 | number spaces. 635 | 636 | When a PTO timer expires, the PTO backoff MUST be increased, resulting in the 637 | PTO period being set to twice its current value. The PTO backoff factor is reset 638 | when an acknowledgment is received, except in the following case. A server 639 | might take longer to respond to packets during the handshake than otherwise. To 640 | protect such a server from repeated client probes, the PTO backoff is not reset 641 | at a client that is not yet certain that the server has finished validating the 642 | client's address. That is, a client does not reset the PTO backoff factor on 643 | receiving acknowledgments in Initial packets. 644 | 645 | This exponential reduction in the sender's rate is important because consecutive 646 | PTOs might be caused by loss of packets or acknowledgments due to severe 647 | congestion. Even when there are ack-eliciting packets in flight in multiple 648 | packet number spaces, the exponential increase in PTO occurs across all spaces 649 | to prevent excess load on the network. For example, a timeout in the Initial 650 | packet number space doubles the length of the timeout in the Handshake packet 651 | number space. 652 | 653 | The total length of time over which consecutive PTOs expire is limited by the 654 | idle timeout. 655 | 656 | The PTO timer MUST NOT be set if a timer is set for time threshold 657 | loss detection; see {{time-threshold}}. A timer that is set for time 658 | threshold loss detection will expire earlier than the PTO timer 659 | in most cases and is less likely to spuriously retransmit data. 
660 | 661 | ### Handshakes and New Paths {#pto-handshake} 662 | 663 | Resumed connections over the same network MAY use the previous connection's 664 | final smoothed RTT value as the resumed connection's initial RTT. When no 665 | previous RTT is available, the initial RTT SHOULD be set to 333 milliseconds. 666 | This results in handshakes starting with a PTO of 1 second, as recommended for 667 | TCP's initial RTO; see {{Section 2 of RFC6298}}. 668 | 669 | A connection MAY use the delay between sending a PATH_CHALLENGE and receiving a 670 | PATH_RESPONSE to set the initial RTT (see kInitialRtt in 671 | {{constants-of-interest}}) for a new path, but the delay SHOULD NOT be 672 | considered an RTT sample. 673 | 674 | When the Initial keys and Handshake keys are discarded (see 675 | {{discarding-packets}}), any Initial packets and Handshake packets can 676 | no longer be acknowledged, so they are removed from bytes in 677 | flight. When Initial or Handshake keys are discarded, the PTO and loss 678 | detection timers MUST be reset, because discarding keys indicates 679 | forward progress and the loss detection timer might have been set for 680 | a now-discarded packet number space. 681 | 682 | #### Before Address Validation 683 | 684 | Until the server has validated the client's address on the path, the amount of 685 | data it can send is limited to three times the amount of data received, 686 | as specified in {{Section 8.1 of QUIC-TRANSPORT}}. If no additional data can be 687 | sent, the server's PTO timer MUST NOT be armed until datagrams have been 688 | received from the client because packets sent on PTO count against the 689 | anti-amplification limit. 690 | 691 | When the server receives a datagram from the client, the amplification limit is 692 | increased and the server resets the PTO timer. If the PTO timer is then set to 693 | a time in the past, it is executed immediately. 
Doing so avoids sending new 694 | 1-RTT packets prior to packets critical to the completion of the handshake. 695 | In particular, this can happen when 0-RTT is accepted but the server fails to 696 | validate the client's address. 697 | 698 | Since the server could be blocked until more datagrams are received from the 699 | client, it is the client's responsibility to send packets to unblock the server 700 | until it is certain that the server has finished its address validation (see 701 | {{Section 8 of QUIC-TRANSPORT}}). That is, the client MUST set the PTO timer 702 | if the client has not received an acknowledgment for any of its Handshake 703 | packets and the handshake is not confirmed (see {{Section 4.1.2 of QUIC-TLS}}), 704 | even if there are no packets in flight. When the PTO fires, the client MUST 705 | send a Handshake packet if it has Handshake keys, otherwise it MUST send an 706 | Initial packet in a UDP datagram with a payload of at least 1200 bytes. 707 | 708 | ### Speeding up Handshake Completion 709 | 710 | When a server receives an Initial packet containing duplicate CRYPTO data, 711 | it can assume the client did not receive all of the server's CRYPTO data sent 712 | in Initial packets, or the client's estimated RTT is too small. When a 713 | client receives Handshake or 1-RTT packets prior to obtaining Handshake keys, 714 | it may assume some or all of the server's Initial packets were lost. 715 | 716 | To speed up handshake completion under these conditions, an endpoint MAY, for a 717 | limited number of times per connection, send a packet containing 718 | unacknowledged CRYPTO data earlier than the PTO expiry, subject to the address 719 | validation limits in {{Section 8.1 of QUIC-TRANSPORT}}. Doing so at most once 720 | for each connection is adequate to quickly recover from a single packet loss. 
721 | An endpoint that always retransmits packets in response to receiving packets 722 | that it cannot process risks creating an infinite exchange of packets. 723 | 724 | Endpoints can also use coalesced packets (see {{Section 12.2 of 725 | QUIC-TRANSPORT}}) to ensure that each datagram elicits at least one 726 | acknowledgment. For example, a client can coalesce an Initial packet containing 727 | PING and PADDING frames with a 0-RTT data packet, and a server can coalesce an 728 | Initial packet containing a PING frame with one or more packets in its first 729 | flight. 730 | 731 | ### Sending Probe Packets 732 | 733 | When a PTO timer expires, a sender MUST send at least one ack-eliciting packet 734 | in the packet number space as a probe. An endpoint MAY send up to two 735 | full-sized datagrams containing ack-eliciting packets to avoid an expensive 736 | consecutive PTO expiration due to a single lost datagram or to transmit data 737 | from multiple packet number spaces. All probe packets sent on a PTO MUST be 738 | ack-eliciting. 739 | 740 | In addition to sending data in the packet number space for which the timer 741 | expired, the sender SHOULD send ack-eliciting packets from other packet number 742 | spaces with in-flight data, coalescing packets if possible. This is 743 | particularly valuable when the server has both Initial and Handshake data in 744 | flight or when the client has both Handshake and Application Data in flight 745 | because the peer might only have receive keys for one of the two packet number 746 | spaces. 747 | 748 | If the sender wants to elicit a faster acknowledgment on PTO, it can skip a 749 | packet number to eliminate the acknowledgment delay. 750 | 751 | An endpoint SHOULD include new data in packets that are sent on PTO expiration. 752 | Previously sent data MAY be sent if no new data can be sent. 
Implementations 753 | MAY use alternative strategies for determining the content of probe packets, 754 | including sending new or retransmitted data based on the application's 755 | priorities. 756 | 757 | It is possible the sender has no new or previously sent data to send. 758 | As an example, consider the following sequence of events: new application data 759 | is sent in a STREAM frame, deemed lost, then retransmitted in a new packet, 760 | and then the original transmission is acknowledged. When there is no data to 761 | send, the sender SHOULD send a PING or other ack-eliciting frame in a single 762 | packet, rearming the PTO timer. 763 | 764 | Alternatively, instead of sending an ack-eliciting packet, the sender MAY mark 765 | any packets still in flight as lost. Doing so avoids sending an additional 766 | packet but increases the risk that loss is declared too aggressively, resulting 767 | in an unnecessary rate reduction by the congestion controller. 768 | 769 | Consecutive PTO periods increase exponentially, and as a result, connection 770 | recovery latency increases exponentially as packets continue to be dropped in 771 | the network. Sending two packets on PTO expiration increases resilience to 772 | packet drops, thus reducing the probability of consecutive PTO events. 773 | 774 | When the PTO timer expires multiple times and new data cannot be sent, 775 | implementations must choose between sending the same payload every time 776 | or sending different payloads. Sending the same payload may be simpler 777 | and ensures the highest priority frames arrive first. Sending different 778 | payloads each time reduces the chances of spurious retransmission. 779 | 780 | 781 | ## Handling Retry Packets 782 | 783 | A Retry packet causes a client to send another Initial packet, effectively 784 | restarting the connection process. A Retry packet indicates that the Initial 785 | packet was received but not processed. 
A Retry packet cannot be treated as an 786 | acknowledgment because it does not indicate that a packet was processed or 787 | specify the packet number. 788 | 789 | Clients that receive a Retry packet reset congestion control and loss recovery 790 | state, including resetting any pending timers. Other connection state, in 791 | particular cryptographic handshake messages, is retained; see 792 | {{Section 17.2.5 of QUIC-TRANSPORT}}. 793 | 794 | The client MAY compute an RTT estimate to the server as the time period from 795 | when the first Initial packet was sent to when a Retry or a Version Negotiation 796 | packet is received. The client MAY use this value in place of its default for 797 | the initial RTT estimate. 798 | 799 | ## Discarding Keys and Packet State {#discarding-packets} 800 | 801 | When Initial and Handshake packet protection keys are discarded 802 | (see {{Section 4.9 of QUIC-TLS}}), all packets that were sent with those keys 803 | can no longer be acknowledged because their acknowledgments cannot be processed. 804 | The sender MUST discard all recovery state associated with those packets 805 | and MUST remove them from the count of bytes in flight. 806 | 807 | Endpoints stop sending and receiving Initial packets once they start exchanging 808 | Handshake packets; see {{Section 17.2.2.1 of QUIC-TRANSPORT}}. At this point, 809 | recovery state for all in-flight Initial packets is discarded. 810 | 811 | When 0-RTT is rejected, recovery state for all in-flight 0-RTT packets is 812 | discarded. 813 | 814 | If a server accepts 0-RTT, but does not buffer 0-RTT packets that arrive 815 | before Initial packets, early 0-RTT packets will be declared lost, but that 816 | is expected to be infrequent. 817 | 818 | It is expected that keys are discarded at some time after the packets 819 | encrypted with them are either acknowledged or declared lost. 
However, 820 | Initial and Handshake secrets are discarded as soon as Handshake and 821 | 1-RTT keys are proven to be available to both client and server; see 822 | {{Section 4.9.1 of QUIC-TLS}}. 823 | 824 | # Congestion Control {#congestion-control} 825 | 826 | This document specifies a sender-side congestion controller for QUIC similar to 827 | TCP NewReno {{?RFC6582}}. 828 | 829 | The signals QUIC provides for congestion control are generic and are designed to 830 | support different sender-side algorithms. A sender can unilaterally choose a 831 | different algorithm to use, such as CUBIC {{?RFC8312}}. 832 | 833 | If a sender uses a different controller than that specified in this document, 834 | the chosen controller MUST conform to the congestion control guidelines 835 | specified in {{Section 3.1 of RFC8085}}. 836 | 837 | Similar to TCP, packets containing only ACK frames do not count toward bytes 838 | in flight and are not congestion controlled. Unlike TCP, QUIC can detect the 839 | loss of these packets and MAY use that information to adjust the congestion 840 | controller or the rate of ACK-only packets being sent, but this document does 841 | not describe a mechanism for doing so. 842 | 843 | The congestion controller is per path, so packets sent on other paths do not 844 | alter the current path's congestion controller, as described in 845 | {{Section 9.4 of QUIC-TRANSPORT}}. 846 | 847 | The algorithm in this document specifies and uses the controller's congestion 848 | window in bytes. 849 | 850 | An endpoint MUST NOT send a packet if it would cause bytes_in_flight (see 851 | {{vars-of-interest}}) to be larger than the congestion window, unless the packet 852 | is sent on a PTO timer expiration (see {{pto}}) or when entering recovery 853 | (see {{recovery-period}}). 
854 | 855 | ## Explicit Congestion Notification {#congestion-ecn} 856 | 857 | If a path has been validated to support Explicit Congestion Notification (ECN) 858 | {{!RFC3168}} {{?RFC8311}}, QUIC treats a Congestion Experienced (CE) codepoint 859 | in the IP header as a signal of congestion. This document specifies an 860 | endpoint's response when the peer-reported ECN-CE count increases; see {{Section 861 | 13.4.2 of QUIC-TRANSPORT}}. 862 | 863 | ## Initial and Minimum Congestion Window {#initial-cwnd} 864 | 865 | QUIC begins every connection in slow start with the congestion window set to an 866 | initial value. Endpoints SHOULD use an initial congestion window of ten times 867 | the maximum datagram size (max_datagram_size), while limiting the window to the 868 | larger of 14,720 bytes or twice the maximum datagram size. This follows the 869 | analysis and recommendations in {{?RFC6928}}, increasing the byte limit to 870 | account for the smaller 8-byte overhead of UDP compared to the 20-byte overhead 871 | for TCP. 872 | 873 | If the maximum datagram size changes during the connection, the initial 874 | congestion window SHOULD be recalculated with the new size. If the maximum 875 | datagram size is decreased in order to complete the handshake, the 876 | congestion window SHOULD be set to the new initial congestion window. 877 | 878 | Prior to validating the client's address, the server can be further limited by 879 | the anti-amplification limit as specified in {{Section 8.1 of QUIC-TRANSPORT}}. 880 | Though the anti-amplification limit can prevent the congestion window from 881 | being fully utilized and therefore slow down the increase in congestion window, 882 | it does not directly affect the congestion window. 883 | 884 | The minimum congestion window is the smallest value the congestion window can 885 | attain in response to loss, an increase in the peer-reported ECN-CE count, 886 | or persistent congestion. 
The RECOMMENDED value is 2 * max_datagram_size. 887 | 888 | ## Congestion Control States 889 | 890 | The NewReno congestion controller described in this document has three 891 | distinct states, as shown in {{fig-cc-fsm}}. 892 | 893 | ~~~ 894 | New path or +------------+ 895 | persistent congestion | Slow | 896 | (O)---------------------->| Start | 897 | +------------+ 898 | | 899 | Loss or | 900 | ECN-CE increase | 901 | v 902 | +------------+ Loss or +------------+ 903 | | Congestion | ECN-CE increase | Recovery | 904 | | Avoidance |------------------>| Period | 905 | +------------+ +------------+ 906 | ^ | 907 | | | 908 | +----------------------------+ 909 | Acknowledgment of packet 910 | sent during recovery 911 | ~~~ 912 | {: #fig-cc-fsm title="Congestion Control States and Transitions"} 913 | 914 | These states and the transitions between them are described in subsequent 915 | sections. 916 | 917 | ### Slow Start 918 | 919 | A NewReno sender is in slow start any time the congestion window is below the 920 | slow start threshold. A sender begins in slow start because the slow start 921 | threshold is initialized to an infinite value. 922 | 923 | While a sender is in slow start, the congestion window increases by the number 924 | of bytes acknowledged when each acknowledgment is processed. This results in 925 | exponential growth of the congestion window. 926 | 927 | The sender MUST exit slow start and enter a recovery period when a packet is 928 | lost or when the ECN-CE count reported by its peer increases. 929 | 930 | A sender reenters slow start any time the congestion window is less than the 931 | slow start threshold, which only occurs after persistent congestion is 932 | declared. 933 | 934 | ### Recovery {#recovery-period} 935 | 936 | A NewReno sender enters a recovery period when it detects the loss of a packet 937 | or when the ECN-CE count reported by its peer increases. 
A sender that is 938 | already in a recovery period stays in it and does not reenter it. 939 | 940 | On entering a recovery period, a sender MUST set the slow start threshold to 941 | half the value of the congestion window when loss is detected. The congestion 942 | window MUST be set to the reduced value of the slow start threshold before 943 | exiting the recovery period. 944 | 945 | Implementations MAY reduce the congestion window immediately upon entering a 946 | recovery period or use other mechanisms, such as Proportional Rate Reduction 947 | {{?PRR=RFC6937}}, to reduce the congestion window more gradually. If the 948 | congestion window is reduced immediately, a single packet can be sent prior to 949 | reduction. This speeds up loss recovery if the data in the lost packet is 950 | retransmitted and is similar to TCP as described in {{Section 5 of RFC6675}}. 951 | 952 | The recovery period aims to limit congestion window reduction to once per round 953 | trip. Therefore, during a recovery period, the congestion window does not change 954 | in response to new losses or increases in the ECN-CE count. 955 | 956 | A recovery period ends and the sender enters congestion avoidance when a packet 957 | sent during the recovery period is acknowledged. This is slightly different 958 | from TCP's definition of recovery, which ends when the lost segment that 959 | started recovery is acknowledged {{?RFC5681}}. 960 | 961 | ### Congestion Avoidance 962 | 963 | A NewReno sender is in congestion avoidance any time the congestion window is 964 | at or above the slow start threshold and not in a recovery period. 965 | 966 | A sender in congestion avoidance uses an Additive Increase Multiplicative 967 | Decrease (AIMD) approach that MUST limit the increase to the congestion window 968 | to at most one maximum datagram size for each congestion window that is 969 | acknowledged. 
970 | 971 | The sender exits congestion avoidance and enters a recovery period when a 972 | packet is lost or when the ECN-CE count reported by its peer increases. 973 | 974 | ## Ignoring Loss of Undecryptable Packets 975 | 976 | During the handshake, some packet protection keys might not be available when 977 | a packet arrives, and the receiver can choose to drop the packet. In particular, 978 | Handshake and 0-RTT packets cannot be processed until the Initial packets 979 | arrive, and 1-RTT packets cannot be processed until the handshake completes. 980 | Endpoints MAY ignore the loss of Handshake, 0-RTT, and 1-RTT packets that might 981 | have arrived before the peer had packet protection keys to process those 982 | packets. Endpoints MUST NOT ignore the loss of packets that were sent after 983 | the earliest acknowledged packet in a given packet number space. 984 | 985 | ## Probe Timeout 986 | 987 | Probe packets MUST NOT be blocked by the congestion controller. A sender MUST 988 | however count these packets as being additionally in flight, since these packets 989 | add network load without establishing packet loss. Note that sending probe 990 | packets might cause the sender's bytes in flight to exceed the congestion window 991 | until an acknowledgment is received that establishes loss or delivery of 992 | packets. 993 | 994 | ## Persistent Congestion {#persistent-congestion} 995 | 996 | When a sender establishes loss of all packets sent over a long enough duration, 997 | the network is considered to be experiencing persistent congestion. 
998 | 999 | ### Duration {#pc-duration} 1000 | 1001 | The persistent congestion duration is computed as follows: 1002 | 1003 | ~~~pseudocode 1004 | (smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay) * 1005 | kPersistentCongestionThreshold 1006 | ~~~ 1007 | 1008 | Unlike the PTO computation in {{pto}}, this duration includes the max_ack_delay 1009 | irrespective of the packet number spaces in which losses are established. 1010 | 1011 | This duration allows a sender to send as many packets before establishing 1012 | persistent congestion, including some in response to PTO expiration, as TCP does 1013 | with Tail Loss Probes {{?RFC8985}} and an RTO {{?RFC5681}}. 1014 | 1015 | Larger values of kPersistentCongestionThreshold cause the sender to become less 1016 | responsive to persistent congestion in the network, which can result in 1017 | aggressive sending into a congested network. Too small a value can result in a 1018 | sender declaring persistent congestion unnecessarily, resulting in reduced 1019 | throughput for the sender. 1020 | 1021 | The RECOMMENDED value for kPersistentCongestionThreshold is 3, which results in 1022 | behavior that is approximately equivalent to a TCP sender declaring an RTO after 1023 | two TLPs. 1024 | 1025 | This design does not use consecutive PTO events to establish persistent 1026 | congestion, since application patterns impact PTO expiration. For example, a 1027 | sender that sends small amounts of data with silence periods between them 1028 | restarts the PTO timer every time it sends, potentially preventing the PTO timer 1029 | from expiring for a long period of time, even when no acknowledgments are being 1030 | received. The use of a duration enables a sender to establish persistent 1031 | congestion without depending on PTO expiration. 
1032 | 1033 | ### Establishing Persistent Congestion 1034 | 1035 | A sender establishes persistent congestion after the receipt of an 1036 | acknowledgment if two packets that are ack-eliciting are declared lost, and: 1037 | 1038 | * across all packet number spaces, none of the packets sent between the send 1039 | times of these two packets are acknowledged; 1040 | 1041 | * the duration between the send times of these two packets exceeds the 1042 | persistent congestion duration ({{pc-duration}}); and 1043 | 1044 | * a prior RTT sample existed when these two packets were sent. 1045 | 1046 | These two packets MUST be ack-eliciting, since a receiver is required to 1047 | acknowledge only ack-eliciting packets within its maximum acknowledgment delay; 1048 | see {{Section 13.2 of QUIC-TRANSPORT}}. 1049 | 1050 | The persistent congestion period SHOULD NOT start until there is at least one 1051 | RTT sample. Before the first RTT sample, a sender arms its PTO timer based on 1052 | the initial RTT ({{pto-handshake}}), which could be substantially larger than 1053 | the actual RTT. Requiring a prior RTT sample prevents a sender from establishing 1054 | persistent congestion with potentially too few probes. 1055 | 1056 | Since network congestion is not affected by packet number spaces, persistent 1057 | congestion SHOULD consider packets sent across packet number spaces. A sender 1058 | that does not have state for all packet number spaces or an implementation that 1059 | cannot compare send times across packet number spaces MAY use state for just the 1060 | packet number space that was acknowledged. This might result in erroneously 1061 | declaring persistent congestion, but it will not lead to a failure to detect 1062 | persistent congestion. 1063 | 1064 | When persistent congestion is declared, the sender's congestion window MUST be 1065 | reduced to the minimum congestion window (kMinimumWindow), similar to a TCP 1066 | sender's response on an RTO {{RFC5681}}. 
1067 | 1068 | ### Example 1069 | 1070 | The following example illustrates how a sender might establish persistent 1071 | congestion. Assume: 1072 | 1073 | ~~~pseudocode 1074 | smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay = 2 1075 | kPersistentCongestionThreshold = 3 1076 | ~~~ 1077 | 1078 | Consider the following sequence of events: 1079 | 1080 | | Time | Action | 1081 | |:-------|:----------------------------------| 1082 | | t=0 | Send packet #1 (application data) | 1083 | | t=1 | Send packet #2 (application data) | 1084 | | t=1.2 | Receive acknowledgment of #1 | 1085 | | t=2 | Send packet #3 (application data) | 1086 | | t=3 | Send packet #4 (application data) | 1087 | | t=4 | Send packet #5 (application data) | 1088 | | t=5 | Send packet #6 (application data) | 1089 | | t=6 | Send packet #7 (application data) | 1090 | | t=8 | Send packet #8 (PTO 1) | 1091 | | t=12 | Send packet #9 (PTO 2) | 1092 | | t=12.2 | Receive acknowledgment of #9 | 1093 | 1094 | Packets 2 through 8 are declared lost when the acknowledgment for packet 9 is 1095 | received at `t = 12.2`. 1096 | 1097 | The congestion period is calculated as the time between the oldest and newest 1098 | lost packets: `8 - 1 = 7`. The persistent congestion duration is `2 * 3 = 6`. 1099 | Because the threshold was reached and because none of the packets between the 1100 | oldest and the newest lost packets were acknowledged, the network is considered 1101 | to have experienced persistent congestion. 1102 | 1103 | While this example shows PTO expirations, they are not required for persistent 1104 | congestion to be established. 1105 | 1106 | 1107 | ## Pacing {#pacing} 1108 | 1109 | A sender SHOULD pace sending of all in-flight packets based on input from the 1110 | congestion controller. 1111 | 1112 | Sending multiple packets into the network without any delay between them creates 1113 | a packet burst that might cause short-term congestion and losses. 
Senders MUST 1114 | either use pacing or limit such bursts. Senders SHOULD limit bursts to the 1115 | initial congestion window; see {{initial-cwnd}}. A sender with knowledge that 1116 | the network path to the receiver can absorb larger bursts MAY use a higher 1117 | limit. 1118 | 1119 | An implementation should take care to architect its congestion controller to 1120 | work well with a pacer. For instance, a pacer might wrap the congestion 1121 | controller and control the availability of the congestion window, or a pacer 1122 | might pace out packets handed to it by the congestion controller. 1123 | 1124 | Timely delivery of ACK frames is important for efficient loss recovery. To avoid 1125 | delaying their delivery to the peer, packets containing only ACK frames SHOULD 1126 | therefore not be paced. 1127 | 1128 | Endpoints can implement pacing as they choose. A perfectly paced sender spreads 1129 | packets exactly evenly over time. For a window-based congestion controller, such 1130 | as the one in this document, that rate can be computed by averaging the 1131 | congestion window over the RTT. Expressed as a rate in units of 1132 | bytes per time, where congestion_window is in bytes: 1133 | 1134 | ~~~pseudocode 1135 | rate = N * congestion_window / smoothed_rtt 1136 | ~~~ 1137 | 1138 | Or expressed as an inter-packet interval in units of time: 1139 | 1140 | ~~~pseudocode 1141 | interval = ( smoothed_rtt * packet_size / congestion_window ) / N 1142 | ~~~ 1143 | 1144 | Using a value for `N` that is small, but at least 1 (for example, 1.25) ensures 1145 | that variations in RTT do not result in underutilization of the 1146 | congestion window. 1147 | 1148 | Practical considerations, such as packetization, scheduling delays, and 1149 | computational efficiency, can cause a sender to deviate from this rate over time 1150 | periods that are much shorter than an RTT. 
1151 | 1152 | One possible implementation strategy for pacing uses a leaky bucket algorithm, 1153 | where the capacity of the "bucket" is limited to the maximum burst size and the 1154 | rate the "bucket" fills is determined by the above function. 1155 | 1156 | ## Underutilizing the Congestion Window 1157 | 1158 | When bytes in flight is smaller than the congestion window and sending is not 1159 | pacing limited, the congestion window is underutilized. This can happen due to 1160 | insufficient application data or flow control limits. When this occurs, 1161 | the congestion window SHOULD NOT be increased in either slow start or 1162 | congestion avoidance. 1163 | 1164 | A sender that paces packets (see {{pacing}}) might delay sending packets 1165 | and not fully utilize the congestion window due to this delay. A sender 1166 | SHOULD NOT consider itself application limited if it would have fully 1167 | utilized the congestion window without pacing delay. 1168 | 1169 | A sender MAY implement alternative mechanisms to update its congestion window 1170 | after periods of underutilization, such as those proposed for TCP in 1171 | {{?RFC7661}}. 1172 | 1173 | 1174 | # Security Considerations 1175 | 1176 | ## Loss and Congestion Signals 1177 | 1178 | Loss detection and congestion control fundamentally involve the consumption of 1179 | signals, such as delay, loss, and ECN markings, from unauthenticated 1180 | entities. An attacker can cause endpoints to reduce their sending rate by 1181 | manipulating these signals: by dropping packets, by altering path delay 1182 | strategically, or by changing ECN codepoints. 1183 | 1184 | ## Traffic Analysis 1185 | 1186 | Packets that carry only ACK frames can be heuristically identified by observing 1187 | packet size. Acknowledgment patterns may expose information about link 1188 | characteristics or application behavior. 
To reduce leaked information, 1189 | endpoints can bundle acknowledgments with other frames, or they can use PADDING 1190 | frames at a potential cost to performance. 1191 | 1192 | ## Misreporting ECN Markings 1193 | 1194 | A receiver can misreport ECN markings to alter the congestion response of a 1195 | sender. Suppressing reports of ECN-CE markings could cause a sender to 1196 | increase their send rate. This increase could result in congestion and loss. 1197 | 1198 | A sender can detect suppression of reports by marking occasional packets that it 1199 | sends with an ECN-CE marking. If a packet sent with an ECN-CE marking is not 1200 | reported as having been CE marked when the packet is acknowledged, then the 1201 | sender can disable ECN for that path by not setting ECN-Capable Transport (ECT) 1202 | codepoints in subsequent packets sent on that path {{!RFC3168}}. 1203 | 1204 | Reporting additional ECN-CE markings will cause a sender to reduce their sending 1205 | rate, which is similar in effect to advertising reduced connection flow control 1206 | limits and so no advantage is gained by doing so. 1207 | 1208 | Endpoints choose the congestion controller that they use. Congestion controllers 1209 | respond to reports of ECN-CE by reducing their rate, but the response may vary. 1210 | Markings can be treated as equivalent to loss {{!RFC3168}}, but other 1211 | responses can be specified, such as {{?RFC8511}} or {{?RFC8311}}. 1212 | 1213 | 1214 | --- back 1215 | 1216 | # Loss Recovery Pseudocode 1217 | 1218 | We now describe an example implementation of the loss detection mechanisms 1219 | described in {{loss-detection}}. 1220 | 1221 | The pseudocode segments in this section are licensed as Code Components; see the 1222 | copyright notice. 1223 | 1224 | ## Tracking Sent Packets {#tracking-sent-packets} 1225 | 1226 | To correctly implement congestion control, a QUIC sender tracks every 1227 | ack-eliciting packet until the packet is acknowledged or lost. 
1228 | It is expected that implementations will be able to access this information by 1229 | packet number and crypto context and store the per-packet fields 1230 | ({{sent-packets-fields}}) for loss recovery and congestion control. 1231 | 1232 | After a packet is declared lost, the endpoint can still maintain state for it 1233 | for an amount of time to allow for packet reordering; see {{Section 13.3 of 1234 | QUIC-TRANSPORT}}. This enables a sender to detect spurious retransmissions. 1235 | 1236 | Sent packets are tracked for each packet number space, and ACK 1237 | processing only applies to a single space. 1238 | 1239 | ### Sent Packet Fields {#sent-packets-fields} 1240 | 1241 | packet_number: 1242 | : The packet number of the sent packet. 1243 | 1244 | ack_eliciting: 1245 | : A Boolean that indicates whether a packet is ack-eliciting. 1246 | If true, it is expected that an acknowledgment will be received, 1247 | though the peer could delay sending the ACK frame containing it 1248 | by up to the max_ack_delay. 1249 | 1250 | in_flight: 1251 | : A Boolean that indicates whether the packet counts toward bytes in 1252 | flight. 1253 | 1254 | sent_bytes: 1255 | : The number of bytes sent in the packet, not including UDP or IP 1256 | overhead, but including QUIC framing overhead. 1257 | 1258 | time_sent: 1259 | : The time the packet was sent. 1260 | 1261 | 1262 | ## Constants of Interest {#constants-of-interest} 1263 | 1264 | Constants used in loss recovery are based on a combination of RFCs, papers, and 1265 | common practice. 1266 | 1267 | kPacketThreshold: 1268 | : Maximum reordering in packets before packet threshold loss detection 1269 | considers a packet lost. The value recommended in {{packet-threshold}} is 3. 1270 | 1271 | kTimeThreshold: 1272 | 1273 | : Maximum reordering in time before time threshold loss detection 1274 | considers a packet lost. Specified as an RTT multiplier. The value 1275 | recommended in {{time-threshold}} is 9/8. 
1276 | 1277 | kGranularity: 1278 | 1279 | : Timer granularity. This is a system-dependent value, and {{time-threshold}} 1280 | recommends a value of 1 ms. 1281 | 1282 | kInitialRtt: 1283 | : The RTT used before an RTT sample is taken. The value recommended in 1284 | {{pto-handshake}} is 333 ms. 1285 | 1286 | kPacketNumberSpace: 1287 | : An enum to enumerate the three packet number spaces: 1288 | 1289 | ~~~ 1290 | enum kPacketNumberSpace { 1291 | Initial, 1292 | Handshake, 1293 | ApplicationData, 1294 | } 1295 | ~~~ 1296 | 1297 | ## Variables of Interest {#ld-vars-of-interest} 1298 | 1299 | Variables required to implement the loss detection mechanisms 1300 | are described in this section. 1301 | 1302 | latest_rtt: 1303 | : The most recent RTT measurement made when receiving an acknowledgment for 1304 | a previously unacknowledged packet. 1305 | 1306 | smoothed_rtt: 1307 | : The smoothed RTT of the connection, computed as described in 1308 | {{smoothed-rtt}}. 1309 | 1310 | rttvar: 1311 | : The RTT variation, computed as described in {{smoothed-rtt}}. 1312 | 1313 | min_rtt: 1314 | : The minimum RTT seen over a period of time, ignoring acknowledgment delay, as 1315 | described in {{min-rtt}}. 1316 | 1317 | first_rtt_sample: 1318 | : The time that the first RTT sample was obtained. 1319 | 1320 | max_ack_delay: 1321 | : The maximum amount of time by which the receiver intends to delay 1322 | acknowledgments for packets in the Application Data packet number 1323 | space, as defined by the eponymous transport parameter ({{Section 18.2 1324 | of QUIC-TRANSPORT}}). Note that the actual ack_delay in a received 1325 | ACK frame may be larger due to late timers, reordering, or loss. 1326 | 1327 | loss_detection_timer: 1328 | : Multi-modal timer used for loss detection. 1329 | 1330 | pto_count: 1331 | : The number of times a PTO has been sent without receiving an acknowledgment. 
1332 | 1333 | time_of_last_ack_eliciting_packet\[kPacketNumberSpace]: 1334 | : The time the most recent ack-eliciting packet was sent. 1335 | 1336 | largest_acked_packet\[kPacketNumberSpace]: 1337 | : The largest packet number acknowledged in the packet number space so far. 1338 | 1339 | loss_time\[kPacketNumberSpace]: 1340 | : The time at which the next packet in that packet number space can be 1341 | considered lost based on exceeding the reordering window in time. 1342 | 1343 | sent_packets\[kPacketNumberSpace]: 1344 | : An association of packet numbers in a packet number space to information 1345 | about them. Described in detail above in {{tracking-sent-packets}}. 1346 | 1347 | 1348 | ## Initialization 1349 | 1350 | At the beginning of the connection, initialize the loss detection variables as 1351 | follows: 1352 | 1353 | ~~~pseudocode 1354 | loss_detection_timer.reset() 1355 | pto_count = 0 1356 | latest_rtt = 0 1357 | smoothed_rtt = kInitialRtt 1358 | rttvar = kInitialRtt / 2 1359 | min_rtt = 0 1360 | first_rtt_sample = 0 1361 | for pn_space in [ Initial, Handshake, ApplicationData ]: 1362 | largest_acked_packet[pn_space] = infinite 1363 | time_of_last_ack_eliciting_packet[pn_space] = 0 1364 | loss_time[pn_space] = 0 1365 | ~~~ 1366 | 1367 | 1368 | ## On Sending a Packet 1369 | 1370 | After a packet is sent, information about the packet is stored. The parameters 1371 | to OnPacketSent are described in detail above in {{sent-packets-fields}}. 
1372 | 1373 | Pseudocode for OnPacketSent follows: 1374 | 1375 | ~~~pseudocode 1376 | OnPacketSent(packet_number, pn_space, ack_eliciting, 1377 | in_flight, sent_bytes): 1378 | sent_packets[pn_space][packet_number].packet_number = 1379 | packet_number 1380 | sent_packets[pn_space][packet_number].time_sent = now() 1381 | sent_packets[pn_space][packet_number].ack_eliciting = 1382 | ack_eliciting 1383 | sent_packets[pn_space][packet_number].in_flight = in_flight 1384 | sent_packets[pn_space][packet_number].sent_bytes = sent_bytes 1385 | if (in_flight): 1386 | if (ack_eliciting): 1387 | time_of_last_ack_eliciting_packet[pn_space] = now() 1388 | OnPacketSentCC(sent_bytes) 1389 | SetLossDetectionTimer() 1390 | ~~~ 1391 | 1392 | ## On Receiving a Datagram 1393 | 1394 | When a server is blocked by anti-amplification limits, receiving 1395 | a datagram unblocks it, even if none of the packets in the 1396 | datagram are successfully processed. In such a case, the PTO 1397 | timer will need to be rearmed. 1398 | 1399 | Pseudocode for OnDatagramReceived follows: 1400 | 1401 | ~~~pseudocode 1402 | OnDatagramReceived(datagram): 1403 | // If this datagram unblocks the server, arm the 1404 | // PTO timer to avoid deadlock. 1405 | if (server was at anti-amplification limit): 1406 | SetLossDetectionTimer() 1407 | if loss_detection_timer.timeout < now(): 1408 | // Execute PTO if it would have expired 1409 | // while the amplification limit applied. 1410 | OnLossDetectionTimeout() 1411 | ~~~ 1412 | 1413 | ## On Receiving an Acknowledgment 1414 | 1415 | When an ACK frame is received, it may newly acknowledge any number of packets. 
1416 | 1417 | Pseudocode for OnAckReceived and UpdateRtt follow: 1418 | 1419 | ~~~pseudocode 1420 | IncludesAckEliciting(packets): 1421 | for packet in packets: 1422 | if (packet.ack_eliciting): 1423 | return true 1424 | return false 1425 | 1426 | OnAckReceived(ack, pn_space): 1427 | if (largest_acked_packet[pn_space] == infinite): 1428 | largest_acked_packet[pn_space] = ack.largest_acked 1429 | else: 1430 | largest_acked_packet[pn_space] = 1431 | max(largest_acked_packet[pn_space], ack.largest_acked) 1432 | 1433 | // DetectAndRemoveAckedPackets finds packets that are newly 1434 | // acknowledged and removes them from sent_packets. 1435 | newly_acked_packets = 1436 | DetectAndRemoveAckedPackets(ack, pn_space) 1437 | // Nothing to do if there are no newly acked packets. 1438 | if (newly_acked_packets.empty()): 1439 | return 1440 | 1441 | // Update the RTT if the largest acknowledged is newly acked 1442 | // and at least one ack-eliciting was newly acked. 1443 | if (newly_acked_packets.largest().packet_number == 1444 | ack.largest_acked && 1445 | IncludesAckEliciting(newly_acked_packets)): 1446 | latest_rtt = 1447 | now() - newly_acked_packets.largest().time_sent 1448 | UpdateRtt(ack.ack_delay) 1449 | 1450 | // Process ECN information if present. 1451 | if (ACK frame contains ECN information): 1452 | ProcessECN(ack, pn_space) 1453 | 1454 | lost_packets = DetectAndRemoveLostPackets(pn_space) 1455 | if (!lost_packets.empty()): 1456 | OnPacketsLost(lost_packets) 1457 | OnPacketsAcked(newly_acked_packets) 1458 | 1459 | // Reset pto_count unless the client is unsure if 1460 | // the server has validated the client's address. 
1461 | if (PeerCompletedAddressValidation()): 1462 | pto_count = 0 1463 | SetLossDetectionTimer() 1464 | 1465 | 1466 | UpdateRtt(ack_delay): 1467 | if (first_rtt_sample == 0): 1468 | min_rtt = latest_rtt 1469 | smoothed_rtt = latest_rtt 1470 | rttvar = latest_rtt / 2 1471 | first_rtt_sample = now() 1472 | return 1473 | 1474 | // min_rtt ignores acknowledgment delay. 1475 | min_rtt = min(min_rtt, latest_rtt) 1476 | // Limit ack_delay by max_ack_delay after handshake 1477 | // confirmation. 1478 | if (handshake confirmed): 1479 | ack_delay = min(ack_delay, max_ack_delay) 1480 | 1481 | // Adjust for acknowledgment delay if plausible. 1482 | adjusted_rtt = latest_rtt 1483 | if (latest_rtt >= min_rtt + ack_delay): 1484 | adjusted_rtt = latest_rtt - ack_delay 1485 | 1486 | rttvar = 3/4 * rttvar + 1/4 * abs(smoothed_rtt - adjusted_rtt) 1487 | smoothed_rtt = 7/8 * smoothed_rtt + 1/8 * adjusted_rtt 1488 | ~~~ 1489 | 1490 | ## Setting the Loss Detection Timer 1491 | 1492 | QUIC loss detection uses a single timer for all timeout loss detection. The 1493 | duration of the timer is based on the timer's mode, which is set in the packet 1494 | and timer events further below. The function SetLossDetectionTimer defined 1495 | below shows how the single timer is set. 1496 | 1497 | This algorithm may result in the timer being set in the past, particularly if 1498 | timers wake up late. Timers set in the past fire immediately. 
1499 | 1500 | Pseudocode for SetLossDetectionTimer follows (where the "^" operator represents 1501 | exponentiation): 1502 | 1503 | ~~~pseudocode 1504 | GetLossTimeAndSpace(): 1505 | time = loss_time[Initial] 1506 | space = Initial 1507 | for pn_space in [ Handshake, ApplicationData ]: 1508 | if (loss_time[pn_space] != 0 && (time == 0 || loss_time[pn_space] < time)): 1509 | time = loss_time[pn_space]; 1510 | space = pn_space 1511 | return time, space 1512 | 1513 | GetPtoTimeAndSpace(): 1514 | duration = (smoothed_rtt + max(4 * rttvar, kGranularity)) 1515 | * (2 ^ pto_count) 1516 | // Anti-deadlock PTO starts from the current time 1517 | if (no ack-eliciting packets in flight): 1518 | assert(!PeerCompletedAddressValidation()) 1519 | if (has handshake keys): 1520 | return (now() + duration), Handshake 1521 | else: 1522 | return (now() + duration), Initial 1523 | pto_timeout = infinite 1524 | pto_space = Initial 1525 | for space in [ Initial, Handshake, ApplicationData ]: 1526 | if (no ack-eliciting packets in flight in space): 1527 | continue; 1528 | if (space == ApplicationData): 1529 | // Skip Application Data until handshake confirmed. 1530 | if (handshake is not confirmed): 1531 | return pto_timeout, pto_space 1532 | // Include max_ack_delay and backoff for Application Data. 1533 | duration += max_ack_delay * (2 ^ pto_count) 1534 | 1535 | t = time_of_last_ack_eliciting_packet[space] + duration 1536 | if (t < pto_timeout): 1537 | pto_timeout = t 1538 | pto_space = space 1539 | return pto_timeout, pto_space 1540 | 1541 | PeerCompletedAddressValidation(): 1542 | // Assume clients validate the server's address implicitly. 1543 | if (endpoint is server): 1544 | return true 1545 | // Servers complete address validation when a 1546 | // protected packet is received. 
1547 | return has received Handshake ACK || 1548 | handshake confirmed 1549 | 1550 | SetLossDetectionTimer(): 1551 | earliest_loss_time, _ = GetLossTimeAndSpace() 1552 | if (earliest_loss_time != 0): 1553 | // Time threshold loss detection. 1554 | loss_detection_timer.update(earliest_loss_time) 1555 | return 1556 | 1557 | if (server is at anti-amplification limit): 1558 | // The server's timer is not set if nothing can be sent. 1559 | loss_detection_timer.cancel() 1560 | return 1561 | 1562 | if (no ack-eliciting packets in flight && 1563 | PeerCompletedAddressValidation()): 1564 | // There is nothing to detect lost, so no timer is set. 1565 | // However, the client needs to arm the timer if the 1566 | // server might be blocked by the anti-amplification limit. 1567 | loss_detection_timer.cancel() 1568 | return 1569 | 1570 | timeout, _ = GetPtoTimeAndSpace() 1571 | loss_detection_timer.update(timeout) 1572 | ~~~ 1573 | 1574 | 1575 | ## On Timeout 1576 | 1577 | When the loss detection timer expires, the timer's mode determines the action 1578 | to be performed. 1579 | 1580 | Pseudocode for OnLossDetectionTimeout follows: 1581 | 1582 | ~~~pseudocode 1583 | OnLossDetectionTimeout(): 1584 | earliest_loss_time, pn_space = GetLossTimeAndSpace() 1585 | if (earliest_loss_time != 0): 1586 | // Time threshold loss Detection 1587 | lost_packets = DetectAndRemoveLostPackets(pn_space) 1588 | assert(!lost_packets.empty()) 1589 | OnPacketsLost(lost_packets) 1590 | SetLossDetectionTimer() 1591 | return 1592 | 1593 | if (no ack-eliciting packets in flight): 1594 | assert(!PeerCompletedAddressValidation()) 1595 | // Client sends an anti-deadlock packet: Initial is padded 1596 | // to earn more anti-amplification credit, 1597 | // a Handshake packet proves address ownership. 1598 | if (has Handshake keys): 1599 | SendOneAckElicitingHandshakePacket() 1600 | else: 1601 | SendOneAckElicitingPaddedInitialPacket() 1602 | else: 1603 | // PTO. 
Send new data if available, else retransmit old data. 1604 | // If neither is available, send a single PING frame. 1605 | _, pn_space = GetPtoTimeAndSpace() 1606 | SendOneOrTwoAckElicitingPackets(pn_space) 1607 | 1608 | pto_count++ 1609 | SetLossDetectionTimer() 1610 | ~~~ 1611 | 1612 | 1613 | ## Detecting Lost Packets 1614 | 1615 | DetectAndRemoveLostPackets is called every time an ACK is received or the time 1616 | threshold loss detection timer expires. This function operates on the 1617 | sent_packets for that packet number space and returns a list of packets newly 1618 | detected as lost. 1619 | 1620 | Pseudocode for DetectAndRemoveLostPackets follows: 1621 | 1622 | ~~~pseudocode 1623 | DetectAndRemoveLostPackets(pn_space): 1624 | assert(largest_acked_packet[pn_space] != infinite) 1625 | loss_time[pn_space] = 0 1626 | lost_packets = [] 1627 | loss_delay = kTimeThreshold * max(latest_rtt, smoothed_rtt) 1628 | 1629 | // Minimum time of kGranularity before packets are deemed lost. 1630 | loss_delay = max(loss_delay, kGranularity) 1631 | 1632 | // Packets sent before this time are deemed lost. 1633 | lost_send_time = now() - loss_delay 1634 | 1635 | foreach unacked in sent_packets[pn_space]: 1636 | if (unacked.packet_number > largest_acked_packet[pn_space]): 1637 | continue 1638 | 1639 | // Mark packet as lost, or set time when it should be marked. 1640 | // Note: The use of kPacketThreshold here assumes that there 1641 | // were no sender-induced gaps in the packet number space. 
1642 | if (unacked.time_sent <= lost_send_time || 1643 | largest_acked_packet[pn_space] >= 1644 | unacked.packet_number + kPacketThreshold): 1645 | sent_packets[pn_space].remove(unacked.packet_number) 1646 | lost_packets.insert(unacked) 1647 | else: 1648 | if (loss_time[pn_space] == 0): 1649 | loss_time[pn_space] = unacked.time_sent + loss_delay 1650 | else: 1651 | loss_time[pn_space] = min(loss_time[pn_space], 1652 | unacked.time_sent + loss_delay) 1653 | return lost_packets 1654 | ~~~ 1655 | 1656 | 1657 | ## Upon Dropping Initial or Handshake Keys 1658 | 1659 | When Initial or Handshake keys are discarded, packets from the space 1660 | are discarded and loss detection state is updated. 1661 | 1662 | Pseudocode for OnPacketNumberSpaceDiscarded follows: 1663 | 1664 | ~~~pseudocode 1665 | OnPacketNumberSpaceDiscarded(pn_space): 1666 | assert(pn_space != ApplicationData) 1667 | RemoveFromBytesInFlight(sent_packets[pn_space]) 1668 | sent_packets[pn_space].clear() 1669 | // Reset the loss detection and PTO timer 1670 | time_of_last_ack_eliciting_packet[pn_space] = 0 1671 | loss_time[pn_space] = 0 1672 | pto_count = 0 1673 | SetLossDetectionTimer() 1674 | ~~~ 1675 | 1676 | 1677 | # Congestion Control Pseudocode 1678 | 1679 | We now describe an example implementation of the congestion controller described 1680 | in {{congestion-control}}. 1681 | 1682 | The pseudocode segments in this section are licensed as Code Components; see the 1683 | copyright notice. 1684 | 1685 | ## Constants of Interest {#cc-consts-of-interest} 1686 | 1687 | Constants used in congestion control are based on a combination of RFCs, papers, 1688 | and common practice. 1689 | 1690 | kInitialWindow: 1691 | : Default limit on the initial bytes in flight as described in {{initial-cwnd}}. 1692 | 1693 | kMinimumWindow: 1694 | : Minimum congestion window in bytes as described in {{initial-cwnd}}. 
1695 | 1696 | kLossReductionFactor: 1697 | : Scaling factor applied to reduce the congestion window when a new loss event 1698 | is detected. {{congestion-control}} recommends a value of 0.5. 1699 | 1700 | kPersistentCongestionThreshold: 1701 | : Period of time for persistent congestion to be established, specified as a PTO 1702 | multiplier. {{persistent-congestion}} recommends a value of 3. 1703 | 1704 | 1705 | ## Variables of Interest {#vars-of-interest} 1706 | 1707 | Variables required to implement the congestion control mechanisms 1708 | are described in this section. 1709 | 1710 | max_datagram_size: 1711 | : The sender's current maximum payload size. This does not include UDP or IP 1712 | overhead. The max datagram size is used for congestion window 1713 | computations. An endpoint sets the value of this variable based on its Path 1714 | Maximum Transmission Unit (PMTU; see {{Section 14.2 of QUIC-TRANSPORT}}), with 1715 | a minimum value of 1200 bytes. 1716 | 1717 | ecn_ce_counters\[kPacketNumberSpace]: 1718 | : The highest value reported for the ECN-CE counter in the packet number space 1719 | by the peer in an ACK frame. This value is used to detect increases in the 1720 | reported ECN-CE counter. 1721 | 1722 | bytes_in_flight: 1723 | : The sum of the size in bytes of all sent packets that contain at least one 1724 | ack-eliciting or PADDING frame and have not been acknowledged or declared 1725 | lost. The size does not include IP or UDP overhead, but does include the QUIC 1726 | header and Authenticated Encryption with Associated Data (AEAD) overhead. 1727 | Packets only containing ACK frames do not count toward bytes_in_flight to 1728 | ensure congestion control does not impede congestion feedback. 1729 | 1730 | congestion_window: 1731 | : Maximum number of bytes allowed to be in flight. 1732 | 1733 | congestion_recovery_start_time: 1734 | : The time the current recovery period started due to the detection of loss 1735 | or ECN. 
When a packet sent after this time is acknowledged, QUIC exits 1736 | congestion recovery. 1737 | 1738 | ssthresh: 1739 | : Slow start threshold in bytes. When the congestion window is below ssthresh, 1740 | the mode is slow start and the window grows by the number of bytes 1741 | acknowledged. 1742 | 1743 | The congestion control pseudocode also accesses some of the variables from the 1744 | loss recovery pseudocode. 1745 | 1746 | ## Initialization 1747 | 1748 | At the beginning of the connection, initialize the congestion control 1749 | variables as follows: 1750 | 1751 | ~~~pseudocode 1752 | congestion_window = kInitialWindow 1753 | bytes_in_flight = 0 1754 | congestion_recovery_start_time = 0 1755 | ssthresh = infinite 1756 | for pn_space in [ Initial, Handshake, ApplicationData ]: 1757 | ecn_ce_counters[pn_space] = 0 1758 | ~~~ 1759 | 1760 | 1761 | ## On Packet Sent 1762 | 1763 | Whenever a packet is sent and it contains non-ACK frames, the packet 1764 | increases bytes_in_flight. 1765 | 1766 | ~~~pseudocode 1767 | OnPacketSentCC(sent_bytes): 1768 | bytes_in_flight += sent_bytes 1769 | ~~~ 1770 | 1771 | 1772 | ## On Packet Acknowledgment 1773 | 1774 | This is invoked from loss detection's OnAckReceived and is supplied with the 1775 | newly acked_packets from sent_packets. 1776 | 1777 | In congestion avoidance, implementers that use an integer representation 1778 | for congestion_window should be careful with division and can use 1779 | the alternative approach suggested in {{Section 2.1 of RFC3465}}. 1780 | 1781 | ~~~pseudocode 1782 | InCongestionRecovery(sent_time): 1783 | return sent_time <= congestion_recovery_start_time 1784 | 1785 | OnPacketsAcked(acked_packets): 1786 | for acked_packet in acked_packets: 1787 | OnPacketAcked(acked_packet) 1788 | 1789 | OnPacketAcked(acked_packet): 1790 | if (!acked_packet.in_flight): 1791 | return; 1792 | // Remove from bytes_in_flight. 
1793 | bytes_in_flight -= acked_packet.sent_bytes 1794 | // Do not increase congestion_window if application 1795 | // limited or flow control limited. 1796 | if (IsAppOrFlowControlLimited()) 1797 | return 1798 | // Do not increase congestion window in recovery period. 1799 | if (InCongestionRecovery(acked_packet.time_sent)): 1800 | return 1801 | if (congestion_window < ssthresh): 1802 | // Slow start. 1803 | congestion_window += acked_packet.sent_bytes 1804 | else: 1805 | // Congestion avoidance. 1806 | congestion_window += 1807 | max_datagram_size * acked_packet.sent_bytes 1808 | / congestion_window 1809 | ~~~ 1810 | 1811 | 1812 | ## On New Congestion Event 1813 | 1814 | This is invoked from ProcessECN and OnPacketsLost when a new congestion event is 1815 | detected. If not already in recovery, this starts a recovery period and 1816 | reduces the slow start threshold and congestion window immediately. 1817 | 1818 | ~~~pseudocode 1819 | OnCongestionEvent(sent_time): 1820 | // No reaction if already in a recovery period. 1821 | if (InCongestionRecovery(sent_time)): 1822 | return 1823 | 1824 | // Enter recovery period. 1825 | congestion_recovery_start_time = now() 1826 | ssthresh = congestion_window * kLossReductionFactor 1827 | congestion_window = max(ssthresh, kMinimumWindow) 1828 | // A packet can be sent to speed up loss recovery. 1829 | MaybeSendOnePacket() 1830 | ~~~ 1831 | 1832 | 1833 | ## Process ECN Information 1834 | 1835 | This is invoked when an ACK frame with an ECN section is received from the peer. 1836 | 1837 | ~~~pseudocode 1838 | ProcessECN(ack, pn_space): 1839 | // If the ECN-CE counter reported by the peer has increased, 1840 | // this could be a new congestion event. 
1841 | if (ack.ce_counter > ecn_ce_counters[pn_space]): 1842 | ecn_ce_counters[pn_space] = ack.ce_counter 1843 | sent_time = sent_packets[pn_space][ack.largest_acked].time_sent 1844 | OnCongestionEvent(sent_time) 1845 | ~~~ 1846 | 1847 | 1848 | ## On Packets Lost 1849 | 1850 | This is invoked when DetectAndRemoveLostPackets deems packets lost. 1851 | 1852 | ~~~pseudocode 1853 | OnPacketsLost(lost_packets): 1854 | sent_time_of_last_loss = 0 1855 | // Remove lost packets from bytes_in_flight. 1856 | for lost_packet in lost_packets: 1857 | if lost_packet.in_flight: 1858 | bytes_in_flight -= lost_packet.sent_bytes 1859 | sent_time_of_last_loss = 1860 | max(sent_time_of_last_loss, lost_packet.time_sent) 1861 | // Congestion event if in-flight packets were lost 1862 | if (sent_time_of_last_loss != 0): 1863 | OnCongestionEvent(sent_time_of_last_loss) 1864 | 1865 | // Reset the congestion window if the loss of these 1866 | // packets indicates persistent congestion. 1867 | // Only consider packets sent after getting an RTT sample. 1868 | if (first_rtt_sample == 0): 1869 | return 1870 | pc_lost = [] 1871 | for lost in lost_packets: 1872 | if lost.time_sent > first_rtt_sample: 1873 | pc_lost.insert(lost) 1874 | if (InPersistentCongestion(pc_lost)): 1875 | congestion_window = kMinimumWindow 1876 | congestion_recovery_start_time = 0 1877 | ~~~ 1878 | 1879 | 1880 | ## Removing Discarded Packets from Bytes in Flight 1881 | 1882 | When Initial or Handshake keys are discarded, packets sent in that space no 1883 | longer count toward bytes in flight. 1884 | 1885 | Pseudocode for RemoveFromBytesInFlight follows: 1886 | 1887 | ~~~pseudocode 1888 | RemoveFromBytesInFlight(discarded_packets): 1889 | // Remove any unacknowledged packets from flight.
1890 | foreach packet in discarded_packets: 1891 | if packet.in_flight 1892 | bytes_in_flight -= size 1893 | ~~~ 1894 | 1895 | 1896 | # Contributors 1897 | {: numbered="false"} 1898 | 1899 | The IETF QUIC Working Group received an enormous amount of support from many 1900 | people. The following people provided substantive contributions to this 1901 | document: 1902 | 1903 | 1918 | -------------------------------------------------------------------------------- /tag.sh: -------------------------------------------------------------------------------- 1 | # Tag files for submission. 2 | # 3 | # You shouldn't need to use this unless you are tagging files for which you are 4 | # not an author. Use `git tag -a` instead. 5 | # 6 | # This script exists because 7 | # https://trac.tools.ietf.org/tools/ietfdb/ticket/2390 still isn't fixed. 8 | 9 | if [[ $# -eq 0 ]]; then 10 | files=(invariants transport tls recovery http qpack) 11 | else 12 | files=("$@") 13 | fi 14 | 15 | enabled() { 16 | r="$1"; shift 17 | for e; do [[ "$e" == "$r" ]] && return 0; done 18 | return 1 19 | } 20 | 21 | declare -A authors=( \ 22 | [transport]=mt@lowentropy.net \ 23 | [tls]=mt@lowentropy.net \ 24 | [recovery]=ianswett@google.com \ 25 | [http]=mbishop@evequefou.be \ 26 | [invariants]=mt@lowentropy.net \ 27 | [qpack]=afrind@fb.com \ 28 | ) 29 | 30 | if ! 
make; then 31 | echo "FAILED TO BUILD STOP" 1>&2 32 | exit 1 33 | fi 34 | 35 | all=($(make show-next)) 36 | tags=() 37 | thisuser=$(git config --get user.name) 38 | 39 | for t in "${all[@]}"; do 40 | r="${t%-[0-9][0-9]}" 41 | r="${r#draft-ietf-quic-}" 42 | if enabled "$r" "${files[@]}"; then 43 | message="Tag for $t created by $thisuser" 44 | git -c user.email="${authors[$r]}" tag -am "$message" "$t" 45 | tags+=("$t") 46 | fi 47 | done 48 | for t in "${tags[@]}"; do 49 | git push origin "$t" 50 | done 51 | -------------------------------------------------------------------------------- /writeups/base-drafts.md: -------------------------------------------------------------------------------- 1 | # Shepherd Writeup for QUIC "base drafts" 2 | 3 | ## 1. Summary 4 | 9 | 10 | This publication requests covers the following I-Ds that together define the 11 | QUIC protocol: 12 | 13 | * **QUIC: A UDP-Based Multiplexed and Secure Transport**, 14 | draft-ietf-quic-transport-31 15 | * **QUIC Loss Detection and Congestion Control**, draft-ietf-quic-recovery-31 16 | * **Using TLS to Secure QUIC**, draft-ietf-quic-tls-31 17 | * **Version-Independent Properties of QUIC**, draft-ietf-quic-invariants-11 18 | * **Hypertext Transfer Protocol Version 3 (HTTP/3)**, draft-ietf-quic-http-31 19 | * **QPACK: Header Compression for HTTP/3**, draft-ietf-quic-qpack-18 20 | 21 | All of these I-Ds are intended to become Proposed Standard RFCs, and that 22 | intended status is indicated in their respective title page headers. 23 | 24 | 25 | ## 2. Document Announcement Write-Up 26 | 32 | 33 | 34 | ### Technical Summary: 35 | 40 | 41 | QUIC is a standards-track, UDP-based, stream-multiplexing, encrypted transport 42 | protocol. 
Its main features are minimizing connection establishment and overall 43 | transport latency for applications such as HTTP/3, providing multiplexing 44 | without head-of-line blocking, requiring only changes to path endpoints to 45 | enable deployment, and providing always-secure transport using TLS 1.3. 46 | 47 | This document set specifies the QUIC transport protocol and its version-independent 48 | invariants, its loss detection and recovery approach, its use of TLS 1.3 for 49 | providing security, and a new version of HTTP that uses QUIC (HTTP/3), 50 | along with QPACK for header compression in that protocol. 51 | 52 | 53 | ### Working Group Summary: 54 | 59 | 60 | As can be expected, discussion on many aspects of QUIC was quite intense. The 61 | resulting consensus, however, was judged by the chairs to be both strong and broad. 62 | 63 | 64 | ### Document Quality: 65 | 74 | 75 | There are over twenty implementations of QUIC that are participating in interop 76 | testing, including all major web browsers and many server, CDN and standalone 77 | library implementations. 78 | 79 | The acknowledgements sections of the I-Ds highlight the individuals that made 80 | major contributions to a given document. 81 | 82 | 83 | ### Personnel: 84 | 85 | 86 | The document shepherds for the individual I-Ds are: 87 | 88 | * **Lucas Pardue**: 89 | * draft-ietf-quic-http-31 90 | * draft-ietf-quic-qpack-18 91 | * **Lars Eggert**: 92 | * draft-ietf-quic-transport-31 93 | * draft-ietf-quic-recovery-31 94 | * **Mark Nottingham**: 95 | * draft-ietf-quic-tls-31 96 | * draft-ietf-quic-invariants-11 97 | 98 | The responsible AD for the document set is Magnus Westerlund. 99 | 100 | 101 | ## 3. Document Shepherd Review 102 | 107 | 108 | The document shepherds extensively reviewed the documents before this 109 | publication request. 110 | 111 | 112 | ## 4.
Document Shepherd Review Concerns 113 | 117 | 118 | The document shepherds have no concerns about the depth or breadth of the 119 | reviews for these documents. 120 | 121 | 122 | ## 5. Broader Reviews 123 | 128 | 129 | Parts of the document set benefited from specialized reviews from the TLS, HTTP 130 | and transport IETF communities. 131 | 132 | 133 | ## 6. Document Shepherd General Concerns 134 | 142 | 143 | The document shepherds have no general concerns about these documents. 144 | 145 | 146 | ## 7. IPR Disclosure Obligation 147 | 152 | 153 | The editors of the I-Ds have all declared that they have filed any and all 154 | appropriate IPR disclosures required for full conformance with the provisions of 155 | BCP 78 and BCP 79. 156 | 157 | 158 | ## 8. Filed IPR Disclosures 159 | 163 | 164 | draft-ietf-quic-recovery has had an IPR disclosure filed on it. No resulting 165 | technical changes were argued for. 166 | 167 | 168 | ## 9. Strength of Consensus 169 | 174 | 175 | The consensus behind the document set is very strong, also as evidenced by the 176 | substantial number of existing implementations. 177 | 178 | The WG last calls were forwarded to the TLS and HTTP WGs, due to the topical 179 | relationships. 180 | 181 | 182 | ## 10. Discontent 183 | 189 | 190 | No discontent was voiced. 191 | 192 | 193 | ## 11. Document Nits 194 | 199 | 200 | The IDNits tool does not appear to be functioning correctly, both locally and using the Web service, so it's difficult to ascertain whether its results are accurate (there are many "Failure fetching the file, proceeding without it." errors). 201 | 202 | 203 | ## 12. Formal Review Criteria 204 | 208 | 209 | No formal review requirements are applicable to this document set. 210 | 211 | 212 | ## 13. Split References 213 | 217 | 218 | All references within this document set have been identified as either normative 219 | or informative. 220 | 221 | 222 | ## 14.
Normative References 223 | 228 | 229 | The document set contains the following normative references to I-Ds: 230 | 231 | * draft-ietf-httpbis-cache 232 | * draft-ietf-httpbis-semantics 233 | 234 | All of these are on track for timely publication in their respective WGs. 235 | 236 | 237 | ## 15. Downward References 238 | 243 | 244 | draft-ietf-quic-tls-31 document has a downref to RFC8439 (CHACHA). RFC7539, 245 | which RFC8439 obsoletes, is already listed in the IETF Downref registry. 246 | (draft-ietf-quic-tls-31 also cites a number of NIST standards.) 247 | 248 | ## 16. RFC Status Changes 249 | 257 | 258 | Publication of this document set will not change the status of any existing 259 | RFCs. 260 | 261 | 262 | ## 17. IANA Considerations Review 263 | 273 | 274 | The IANA considerations of the document set have been reviewed and no issues 275 | were identified. 276 | 277 | 278 | ## 18. New "Expert Review" Registries 279 | 284 | 285 | The document set defines several IANA registries that allow for "Provisional 286 | Registrations" and "Permanent Registrations", which both require Expert review. 287 | The IESG should select subject matter experts for these registration types; 288 | candidates include the document editors and the individuals named as 289 | contributors in the acknowledgment sections. 290 | 291 | 292 | ## 19. Validation of Formal Language Parts 293 | 298 | 299 | No formal code exists in the document set. draft-ietf-quic-transport, 300 | draft-ietf-quic-recovery and draft-ietf-quic-qpack contain python-like pseudo 301 | code, but not at a level of detail that would lend itself to automated checking. 302 | 303 | 304 | ## 20. YANG 305 | 314 | 315 | The document set does not contain a YANG model. 316 | -------------------------------------------------------------------------------- /xml2rfc-tidy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Tidy an xml2rfc file. 
#
# This:
# * removes non-semantic content (comments, processing instructions, DOCTYPE
#   declarations, broken entity references)
# * wraps BCP 14 language in <bcp14> elements
# * indents elements neatly

import sys
import xml.sax
import re
from xml.sax.saxutils import escape, quoteattr


class Tidy(xml.sax.handler.ContentHandler):
    """SAX content handler that re-serializes the parsed document to stdout.

    Output is written with ``print`` as events arrive.  A start tag is left
    without its closing ``>`` until the next event shows whether the element
    is empty (``/>``) or has content (``>``).

    Attributes:
        tags: stack of the names of the currently open elements.
        nesting: current indentation depth, counted in non-inline elements.
        c: character data buffered since the last start/end tag.
        state: what was written last -- ``""`` (nothing yet), ``"open"`` (a
            start tag still missing its ``>``), ``"text"`` or ``"close"``.
        nl: True when the last thing written was a line break after a
            non-inline end tag.
    """

    # BCP 14 requirement keywords, longest alternatives first so that
    # "MUST NOT" is matched as one phrase rather than as "MUST".
    pattern = re.compile(
        r"\b((?:(?:MUST|SHOULD|SHALL)(?:\s+NOT)?)|(?:(?:NOT\s+)?RECOMMENDED)|MAY|OPTIONAL|REQUIRED)\b"
    )

    def __init__(self):
        super().__init__()
        self.tags = []
        self.nesting = 0
        self.c = ""
        self.state = ""
        # endElement reads this flag; give it a defined value from the start.
        self.nl = False

    def startDocument(self):
        """Write the XML declaration that heads the tidied document."""
        print('<?xml version="1.0" encoding="UTF-8"?>')

    @staticmethod
    def preserve(tag):
        """Return True for elements whose text is emitted verbatim."""
        return tag in ["artwork", "sourcecode"]

    @staticmethod
    def textElement(tag):
        """Return True for elements that directly contain running text."""
        return tag in [
            "annotation",
            "blockquote",
            "dd",
            "dt",
            "em",
            "li",
            "preamble",
            "refcontent",
            "strong",
            "sub",
            "sup",
            "t",
            "td",
            "th",
            "tt",
        ]

    @staticmethod
    def inline(tag):
        """Return True for elements written inline (no line break/indent)."""
        return tag in [
            "code",
            "contact",
            "cref",
            "em",
            "eref",
            "iref",
            "sub",
            "sup",
            "tt",
            "xref",
        ]

    def flush(self, tag, start=None):
        """Write out the character data buffered in ``self.c`` and clear it.

        Args:
            tag: name of the element that encloses the buffered text, or
                False when no element is open.
            start: name of the element that is about to open, or None when
                the enclosing element is closing.
        """
        if Tidy.preserve(tag):
            # Verbatim content goes into a CDATA section.  A literal "]]>"
            # would end the section early, so split it across two sections.
            body = self.c.replace("]]>", "]]]]><![CDATA[>")
            c = f"<![CDATA[{body}]]>"
        else:
            c = escape(self.c)
            if Tidy.textElement(tag):
                if self.state == "open":
                    # The element is opening, so strip left is safe.
                    c = c.lstrip()
                if start is None or not Tidy.inline(start):
                    # The element is closing, or the element that is starting
                    # isn't inline, so strip right is safe.
                    c = c.rstrip()
                c = Tidy.pattern.sub(r"<bcp14>\1</bcp14>", c)
            else:
                # Whitespace between structural elements carries no meaning.
                c = c.strip()

        if c != "":
            if self.state == "open":
                # Finish the pending start tag before writing its content.
                print(">", end="")
            print(c, end="")
            self.state = "text"
            self.nl = False

        self.c = ""

    def currentTag(self):
        """Return the innermost open element name, or False if none."""
        return next(reversed(self.tags), False)

    def startElement(self, tag, attributes):
        """Write ``<tag attr="..."`` and leave the tag open for later."""
        parent = self.currentTag()
        self.flush(parent, tag)

        if self.state == "open":
            # The parent's start tag is still pending: it has content.
            print(">", end="")
            if not Tidy.inline(tag):
                print()

        self.tags.append(tag)
        if not Tidy.inline(tag):
            # NOTE(review): the indent unit appears as a single space in this
            # copy of the file; whitespace may have been collapsed -- confirm.
            print(" " * self.nesting, end="")
            self.nesting = self.nesting + 1

        print(f"<{tag}", end="")
        for name, value in attributes.items():
            print(f" {name}={quoteattr(value)}", end="")

        self.state = "open"
        self.nl = False

    def endElement(self, tag):
        """Close ``tag``: ``/>`` if nothing was written inside it, otherwise
        an explicit ``</tag>``."""
        self.flush(self.tags.pop())

        if not Tidy.inline(tag):
            self.nesting = self.nesting - 1
            if self.nl and not Tidy.inline(self.currentTag()):
                # At the start of a fresh line: indent the end tag.
                print(" " * self.nesting, end="")
        if self.state == "open":
            print("/>", end="")
        else:
            print(f"</{tag}>", end="")
        self.nl = not Tidy.inline(tag)
        if self.nl:
            print()
        self.state = "close"

    def characters(self, content):
        """Buffer character data; it is written by the next flush()."""
        self.c = self.c + content

    def processingInstruction(self, target, data):
        """Drop processing instructions."""
        pass


def main():
    """Tidy the file named by the first argument, or stdin if none given."""
    parser = xml.sax.make_parser()
    parser.setContentHandler(Tidy())
    if len(sys.argv) >= 2:
        parser.parse(sys.argv[1])
    else:
        parser.parse(sys.stdin)


if __name__ == "__main__":
    main()