├── General
│   ├── CentOS
│   │   ├── bash
│   │   │   ├── tcp_nanqinlang-1.3.2-nocheckvirt.sh
│   │   │   └── tcp_nanqinlang-1.3.2.sh
│   │   └── source
│   │       ├── tcp_bbr.c
│   │       └── tcp_nanqinlang.c
│   └── Debian
│       ├── mod
│       │   └── tcp_nanqinlang-for-v4.10.2.ko
│       └── source
│           ├── kernel-v4.12andbelow
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           ├── kernel-v4.13
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           ├── kernel-v4.14
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           ├── kernel-v4.15
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           └── kernel-v4.16
│               ├── tcp_bbr.c
│               └── tcp_nanqinlang.c
├── Makefile
│   ├── Makefile-CentOS
│   ├── Makefile-Debian7or8
│   └── Makefile-Debian9
└── readme.md
/General/CentOS/bash/tcp_nanqinlang-1.3.2-nocheckvirt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | Green_font="\033[32m" && Yellow_font="\033[33m" && Red_font="\033[31m" && Font_suffix="\033[0m" 3 | Info="${Green_font}[Info]${Font_suffix}" 4 | Error="${Red_font}[Error]${Font_suffix}" 5 | echo -e "${Green_font} 6 | #====================================================== 7 | # Project: tcp_nanqinlang general 8 | # Platform: --CentOS_6/7_64bit --nocheckvirt 9 | # Version: 1.3.2 10 | # Author: nanqinlang 11 | # Blog: https://sometimesnaive.org 12 | # Github: https://github.com/nanqinlang 13 | #======================================================${Font_suffix}" 14 | 15 | check_system(){ 16 | #sort 17 | [[ -z "`cat /etc/redhat-release | grep -iE "CentOS"`" ]] && echo -e "${Error} only support CentOS !" && exit 1 18 | #number 19 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 7."`" ]] && bit=7 20 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 6."`" ]] && bit=6 21 | #bit 22 | [[ "`uname -m`" != "x86_64" ]] && echo -e "${Error} only support 64bit !" && exit 1 23 | } 24 | 25 | check_root(){ 26 | [[ "`id -u`" != "0" ]] && echo -e "${Error} must be root user !" && exit 1 27 | } 28 | 29 | check_kvm(){ 30 | yum update 31 | yum install -y virt-what 32 | [[ "`virt-what`" != "kvm" ]] && echo -e "${Error} only support KVM !" && exit 1 33 | } 34 | 35 | directory(){ 36 | [[ ! 
-d /home/tcp_nanqinlang ]] && mkdir -p /home/tcp_nanqinlang 37 | cd /home/tcp_nanqinlang 38 | } 39 | 40 | check_kernel(){ 41 | # check 4.12.10 already installed or not 42 | already_image=`rpm -qa | grep kernel-4.12.10` 43 | already_devel=`rpm -qa | grep kernel-devel-4.12.10` 44 | already_headers=`rpm -qa | grep kernel-headers-4.12.10` 45 | 46 | delete_surplus_1 47 | 48 | if [[ -z "${already_image}" ]]; then 49 | echo -e "${Info} installing image" && install_image 50 | else echo -e "${Info} noneed install image" 51 | fi 52 | 53 | if [[ -z "${already_devel}" ]]; then 54 | echo -e "${Info} installing devel" && install_devel 55 | else echo -e "${Info} noneed install devel" 56 | fi 57 | 58 | if [[ -z "${already_headers}" ]]; then 59 | echo -e "${Info} installing headers" && install_headers 60 | else echo -e "${Info} noneed install headers" 61 | fi 62 | 63 | update-grub 64 | 65 | } 66 | 67 | delete_surplus_1(){ 68 | #surplus_image=`rpm -qa | grep kernel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 69 | #surplus_devel=`rpm -qa | grep kernel-devel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 70 | #surplus_headers=`rpm -qa | grep kernel-headers | awk '{print $2}' | grep -v "4.12.10" | wc -l` 71 | 72 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 73 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 74 | 75 | while [[ "${surplus_count}" > "1" ]] 76 | do 77 | yum remove -y ${surplus_sort_1} 78 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 79 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 80 | done 81 | } 82 | 83 | delete_surplus_2(){ 84 | current=`uname -r | grep -v "4.12.10"` 85 | if [[ -z "${current}" ]]; then 86 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 87 | while [[ ! -z "${surplus_sort_2}" ]] 88 | do 89 | yum remove -y ${surplus_sort_2} 90 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 91 | done 92 | else 93 | echo -e "${Error} current running kernel is not v4.12.10, please check !" 94 | fi 95 | } 96 | 97 | # achieve 98 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el6/x86_64/RPMS/ 99 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el7/x86_64/RPMS/ 100 | # my backup: https://github.com/nanqinlang/CentOS-kernel 101 | install_image(){ 102 | #[[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 103 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 104 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} ==image download failed, please check !" && exit 1 105 | yum install -y kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 106 | } 107 | install_devel(){ 108 | #[[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 109 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 110 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} devel download failed, please check !" 
&& exit 1 111 | yum install -y kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 112 | } 113 | install_headers(){ 114 | #[[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 115 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 116 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} headers download failed, please check !" && exit 1 117 | yum install -y kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 118 | } 119 | 120 | update-grub(){ 121 | [[ "${bit}" = "7" ]] && grub2-mkconfig -o /boot/grub2/grub.cfg && grub2-set-default 0 122 | [[ "${bit}" = "6" ]] && sed -i '/default=/d' /boot/grub/grub.conf && echo -e "\ndefault=0\c" >> /boot/grub/grub.conf 123 | } 124 | 125 | rpm_list(){ 126 | rpm -qa | grep kernel 127 | } 128 | 129 | maker(){ 130 | yum groupinstall -y "Development Tools" && yum update 131 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && compile 132 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && echo -e "${Error} load mod failed, please check!" && exit 1 133 | } 134 | 135 | compile(){ 136 | wget https://raw.githubusercontent.com/tcp-nanqinlang/general/master/General/CentOS/source/tcp_nanqinlang.c 137 | wget -O Makefile https://raw.githubusercontent.com/tcp-nanqinlang/general/master/Makefile/Makefile-CentOS 138 | make && make install 139 | } 140 | 141 | check_status(){ 142 | #status_sysctl=`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'` 143 | #status_lsmod=`lsmod | grep nanqinlang` 144 | if [[ "`lsmod | grep nanqinlang`" != "" ]]; then 145 | echo -e "${Info} tcp_nanqinlang is installed !" 146 | if [[ "`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'`" = "nanqinlang" ]]; then 147 | echo -e "${Info} tcp_nanqinlang is running !" 148 | else echo -e "${Error} tcp_nanqinlang is installed but not running !" 149 | fi 150 | else 151 | echo -e "${Error} tcp_nanqinlang not installed !" 152 | fi 153 | } 154 | 155 | 156 | 157 | ################################################################################################### 158 | install(){ 159 | check_system 160 | check_root 161 | #check_kvm 162 | directory 163 | check_kernel 164 | rpm_list 165 | echo -e "${Info} 请确认此行上面的列表显示的内核版本后,重启以应用新内核" 166 | } 167 | 168 | start(){ 169 | check_system 170 | check_root 171 | #check_kvm 172 | directory 173 | delete_surplus_2 && update-grub 174 | maker 175 | sed -i '/net\.core\.default_qdisc/d' /etc/sysctl.conf 176 | sed -i '/net\.ipv4\.tcp_congestion_control/d' /etc/sysctl.conf 177 | echo -e "\nnet.core.default_qdisc=fq" >> /etc/sysctl.conf 178 | echo -e "net.ipv4.tcp_congestion_control=nanqinlang\c" >> /etc/sysctl.conf 179 | sysctl -p 180 | check_status 181 | rm -rf /home/tcp_nanqinlang 182 | } 183 | 184 | status(){ 185 | check_status 186 | } 187 | 188 | uninstall(){ 189 | check_root 190 | sed -i '/net\.core\.default_qdisc=/d' /etc/sysctl.conf 191 | sed -i '/net\.ipv4\.tcp_congestion_control=/d' /etc/sysctl.conf 192 | sysctl -p 193 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 194 | echo -e "${Info} please remember ${reboot} to stop tcp_nanqinlang !" 
195 | } 196 | 197 | echo -e "${Info} 选择你要使用的功能: " 198 | echo -e "1.安装内核\n2.开启算法\n3.检查算法运行状态\n4.卸载算法" 199 | read -p "输入数字以选择:" function 200 | 201 | while [[ ! "${function}" =~ ^[1-4]$ ]] 202 | do 203 | echo -e "${Error} 无效输入" 204 | echo -e "${Info} 请重新选择" && read -p "输入数字以选择:" function 205 | done 206 | 207 | if [[ "${function}" == "1" ]]; then 208 | install 209 | elif [[ "${function}" == "2" ]]; then 210 | start 211 | elif [[ "${function}" == "3" ]]; then 212 | status 213 | else 214 | uninstall 215 | fi 216 | -------------------------------------------------------------------------------- /General/CentOS/bash/tcp_nanqinlang-1.3.2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | Green_font="\033[32m" && Yellow_font="\033[33m" && Red_font="\033[31m" && Font_suffix="\033[0m" 3 | Info="${Green_font}[Info]${Font_suffix}" 4 | Error="${Red_font}[Error]${Font_suffix}" 5 | echo -e "${Green_font} 6 | #====================================================== 7 | # Project: tcp_nanqinlang general 8 | # Platform: --CentOS_6/7_64bit --KVM 9 | # Version: 1.3.2 10 | # Author: nanqinlang 11 | # Blog: https://sometimesnaive.org 12 | # Github: https://github.com/nanqinlang 13 | #======================================================${Font_suffix}" 14 | 15 | check_system(){ 16 | #sort 17 | [[ -z "`cat /etc/redhat-release | grep -iE "CentOS"`" ]] && echo -e "${Error} only support CentOS !" && exit 1 18 | #number 19 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 7."`" ]] && bit=7 20 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 6."`" ]] && bit=6 21 | #bit 22 | [[ "`uname -m`" != "x86_64" ]] && echo -e "${Error} only support 64bit !" && exit 1 23 | } 24 | 25 | check_root(){ 26 | [[ "`id -u`" != "0" ]] && echo -e "${Error} must be root user !" && exit 1 27 | } 28 | 29 | check_kvm(){ 30 | yum update 31 | yum install -y virt-what 32 | [[ "`virt-what`" != "kvm" ]] && echo -e "${Error} only support KVM !" && exit 1 33 | } 34 | 35 | directory(){ 36 | [[ ! 
-d /home/tcp_nanqinlang ]] && mkdir -p /home/tcp_nanqinlang 37 | cd /home/tcp_nanqinlang 38 | } 39 | 40 | check_kernel(){ 41 | # check 4.12.10 already installed or not 42 | already_image=`rpm -qa | grep kernel-4.12.10` 43 | already_devel=`rpm -qa | grep kernel-devel-4.12.10` 44 | already_headers=`rpm -qa | grep kernel-headers-4.12.10` 45 | 46 | delete_surplus_1 47 | 48 | if [[ -z "${already_image}" ]]; then 49 | echo -e "${Info} installing image" && install_image 50 | else echo -e "${Info} noneed install image" 51 | fi 52 | 53 | if [[ -z "${already_devel}" ]]; then 54 | echo -e "${Info} installing devel" && install_devel 55 | else echo -e "${Info} noneed install devel" 56 | fi 57 | 58 | if [[ -z "${already_headers}" ]]; then 59 | echo -e "${Info} installing headers" && install_headers 60 | else echo -e "${Info} noneed install headers" 61 | fi 62 | 63 | update-grub 64 | 65 | } 66 | 67 | delete_surplus_1(){ 68 | #surplus_image=`rpm -qa | grep kernel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 69 | #surplus_devel=`rpm -qa | grep kernel-devel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 70 | #surplus_headers=`rpm -qa | grep kernel-headers | awk '{print $2}' | grep -v "4.12.10" | wc -l` 71 | 72 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 73 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 74 | 75 | while [[ "${surplus_count}" > "1" ]] 76 | do 77 | yum remove -y ${surplus_sort_1} 78 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 79 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 80 | done 81 | } 82 | 83 | delete_surplus_2(){ 84 | current=`uname -r | grep -v "4.12.10"` 85 | if [[ -z "${current}" ]]; then 86 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 87 | while [[ ! -z "${surplus_sort_2}" ]] 88 | do 89 | yum remove -y ${surplus_sort_2} 90 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 91 | done 92 | else 93 | echo -e "${Error} current running kernel is not v4.12.10, please check !" 94 | fi 95 | } 96 | 97 | # achieve 98 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el6/x86_64/RPMS/ 99 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el7/x86_64/RPMS/ 100 | # my backup: https://github.com/nanqinlang/CentOS-kernel 101 | install_image(){ 102 | #[[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 103 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 104 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} ==image download failed, please check !" && exit 1 105 | yum install -y kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 106 | } 107 | install_devel(){ 108 | #[[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 109 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 110 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} devel download failed, please check !" 
&& exit 1 111 | yum install -y kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 112 | } 113 | install_headers(){ 114 | #[[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 115 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 116 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} headers download failed, please check !" && exit 1 117 | yum install -y kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 118 | } 119 | 120 | update-grub(){ 121 | [[ "${bit}" = "7" ]] && grub2-mkconfig -o /boot/grub2/grub.cfg && grub2-set-default 0 122 | [[ "${bit}" = "6" ]] && sed -i '/default=/d' /boot/grub/grub.conf && echo -e "\ndefault=0\c" >> /boot/grub/grub.conf 123 | } 124 | 125 | rpm_list(){ 126 | rpm -qa | grep kernel 127 | } 128 | 129 | maker(){ 130 | yum groupinstall -y "Development Tools" && yum update 131 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && compile 132 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && echo -e "${Error} load mod failed, please check!" && exit 1 133 | } 134 | 135 | compile(){ 136 | wget https://raw.githubusercontent.com/tcp-nanqinlang/general/master/General/CentOS/source/tcp_nanqinlang.c 137 | wget -O Makefile https://raw.githubusercontent.com/tcp-nanqinlang/general/master/Makefile/Makefile-CentOS 138 | make && make install 139 | } 140 | 141 | check_status(){ 142 | #status_sysctl=`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'` 143 | #status_lsmod=`lsmod | grep nanqinlang` 144 | if [[ "`lsmod | grep nanqinlang`" != "" ]]; then 145 | echo -e "${Info} tcp_nanqinlang is installed !" 146 | if [[ "`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'`" = "nanqinlang" ]]; then 147 | echo -e "${Info} tcp_nanqinlang is running !" 148 | else echo -e "${Error} tcp_nanqinlang is installed but not running !" 149 | fi 150 | else 151 | echo -e "${Error} tcp_nanqinlang not installed !" 152 | fi 153 | } 154 | 155 | 156 | 157 | ################################################################################################### 158 | install(){ 159 | check_system 160 | check_root 161 | check_kvm 162 | directory 163 | check_kernel 164 | rpm_list 165 | echo -e "${Info} 请确认此行上面的列表显示的内核版本后,重启以应用新内核" 166 | } 167 | 168 | start(){ 169 | check_system 170 | check_root 171 | check_kvm 172 | directory 173 | delete_surplus_2 && update-grub 174 | maker 175 | sed -i '/net\.core\.default_qdisc/d' /etc/sysctl.conf 176 | sed -i '/net\.ipv4\.tcp_congestion_control/d' /etc/sysctl.conf 177 | echo -e "\nnet.core.default_qdisc=fq" >> /etc/sysctl.conf 178 | echo -e "net.ipv4.tcp_congestion_control=nanqinlang\c" >> /etc/sysctl.conf 179 | sysctl -p 180 | check_status 181 | rm -rf /home/tcp_nanqinlang 182 | } 183 | 184 | status(){ 185 | check_status 186 | } 187 | 188 | uninstall(){ 189 | check_root 190 | sed -i '/net\.core\.default_qdisc=/d' /etc/sysctl.conf 191 | sed -i '/net\.ipv4\.tcp_congestion_control=/d' /etc/sysctl.conf 192 | sysctl -p 193 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 194 | echo -e "${Info} please remember ${reboot} to stop tcp_nanqinlang !" 
195 | } 196 | 197 | echo -e "${Info} 选择你要使用的功能: " 198 | echo -e "1.安装内核\n2.开启算法\n3.检查算法运行状态\n4.卸载算法" 199 | read -p "输入数字以选择:" function 200 | 201 | while [[ ! "${function}" =~ ^[1-4]$ ]] 202 | do 203 | echo -e "${Error} 无效输入" 204 | echo -e "${Info} 请重新选择" && read -p "输入数字以选择:" function 205 | done 206 | 207 | if [[ "${function}" == "1" ]]; then 208 | install 209 | elif [[ "${function}" == "2" ]]; then 210 | start 211 | elif [[ "${function}" == "3" ]]; then 212 | status 213 | else 214 | uninstall 215 | fi -------------------------------------------------------------------------------- /General/CentOS/source/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable. 16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * since pacing is integral to the BBR design and implementation. 57 | * BBR without pacing would not function properly, and may incur unnecessary 58 | * high packet loss rates. 
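 *
 * Worked example (illustrative only; the numbers below are hypothetical and
 * not part of the upstream source): if the windowed-max delivery rate is
 * 12,500 pkts/sec and min_rtt is 40 ms, then during STARTUP, where both gains
 * are ~2.885 (2/ln 2):
 *
 *   pacing_rate = 2.885 * 12500          ~= 36,060 pkts/sec
 *   cwnd        = 2.885 * 12500 * 0.040  ~= 1,443 packets   (gain * BDP, BDP = 500 pkts)
 *
 * In steady-state PROBE_BW the pacing_gain cycle averages 1.0, so the pacing
 * rate settles near the measured 12,500 pkts/sec, while cwnd_gain = 2 keeps
 * cwnd around 1,000 packets (2 * BDP) to tolerate delayed/stretched ACKs.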
59 | */ 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | 67 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 68 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 69 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 70 | * Since the minimum window is >=4 packets, the lower bound isn't 71 | * an issue. The upper bound isn't an issue with existing technologies. 72 | */ 73 | #define BW_SCALE 24 74 | #define BW_UNIT (1 << BW_SCALE) 75 | 76 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ 77 | #define BBR_UNIT (1 << BBR_SCALE) 78 | 79 | /* BBR has the following modes for deciding how fast to send: */ 80 | enum bbr_mode { 81 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 82 | BBR_DRAIN, /* drain any queue created during startup */ 83 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 84 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 85 | }; 86 | 87 | /* BBR congestion control block */ 88 | struct bbr { 89 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 90 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 91 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 92 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 93 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 94 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 95 | struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ 96 | u32 mode:3, /* current bbr_mode in state machine */ 97 | prev_ca_state:3, /* CA state on previous ACK */ 98 | packet_conservation:1, /* use packet conservation? */ 99 | restore_cwnd:1, /* decided to revert cwnd to old value */ 100 | round_start:1, /* start of packet-timed tx->ack round? */ 101 | tso_segs_goal:7, /* segments we want in each skb we send */ 102 | idle_restart:1, /* restarting after idle? */ 103 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 104 | unused:5, 105 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 106 | lt_rtt_cnt:7, /* round trips in long-term interval */ 107 | lt_use_bw:1; /* use lt_bw as our bw estimate? 
*/ 108 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 109 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 110 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 111 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 112 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 113 | cwnd_gain:10, /* current gain for setting cwnd */ 114 | full_bw_cnt:3, /* number of rounds without large bw gains */ 115 | cycle_idx:3, /* current index in pacing_gain cycle array */ 116 | unused_b:6; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... 
*/ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? */ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Pace using current bw estimate and a gain factor. In order to help drive the 216 | * network toward lower queues while maintaining high utilization and low 217 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 218 | * estimated bandwidth. This is an important aspect of the design. In this 219 | * implementation this slightly lower pacing rate is achieved implicitly by not 220 | * including link-layer headers in the packet size used for the pacing rate. 221 | */ 222 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 223 | { 224 | struct bbr *bbr = inet_csk_ca(sk); 225 | u64 rate = bw; 226 | 227 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 228 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 229 | if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) 230 | sk->sk_pacing_rate = rate; 231 | } 232 | 233 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 234 | static u32 bbr_tso_segs_goal(struct sock *sk) 235 | { 236 | struct bbr *bbr = inet_csk_ca(sk); 237 | 238 | return bbr->tso_segs_goal; 239 | } 240 | 241 | static void bbr_set_tso_segs_goal(struct sock *sk) 242 | { 243 | struct tcp_sock *tp = tcp_sk(sk); 244 | struct bbr *bbr = inet_csk_ca(sk); 245 | u32 min_segs; 246 | 247 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 248 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 249 | 0x7FU); 250 | } 251 | 252 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 253 | static void bbr_save_cwnd(struct sock *sk) 254 | { 255 | struct tcp_sock *tp = tcp_sk(sk); 256 | struct bbr *bbr = inet_csk_ca(sk); 257 | 258 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 259 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 260 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 261 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 262 | } 263 | 264 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 265 | { 266 | struct tcp_sock *tp = tcp_sk(sk); 267 | struct bbr *bbr = inet_csk_ca(sk); 268 | 269 | if (event == CA_EVENT_TX_START && tp->app_limited) { 270 | bbr->idle_restart = 1; 271 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 272 | * need more speed (we're restarting from idle and app-limited). 273 | */ 274 | if (bbr->mode == BBR_PROBE_BW) 275 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 276 | } 277 | } 278 | 279 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 280 | * estimated bottleneck bandwidth: 281 | * 282 | * cwnd = bw * min_rtt * gain = BDP * gain 283 | * 284 | * The key factor, gain, controls the amount of queue. While a small gain 285 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 286 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 287 | * noise may cause BBR to under-estimate the rate. 288 | * 289 | * To achieve full performance in high-speed paths, we budget enough cwnd to 290 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 291 | * - one skb in sending host Qdisc, 292 | * - one skb in sending host TSO/GSO engine 293 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 294 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 295 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 296 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 297 | * full even with ACK-every-other-packet delayed ACKs. 298 | */ 299 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 300 | { 301 | struct bbr *bbr = inet_csk_ca(sk); 302 | u32 cwnd; 303 | u64 w; 304 | 305 | /* If we've never had a valid RTT sample, cap cwnd at the initial 306 | * default. This should only happen when the connection is not using TCP 307 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 308 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 309 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 310 | */ 311 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 312 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 313 | 314 | w = (u64)bw * bbr->min_rtt_us; 315 | 316 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 317 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 318 | 319 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 320 | cwnd += 3 * bbr->tso_segs_goal; 321 | 322 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 323 | cwnd = (cwnd + 1) & ~1U; 324 | 325 | return cwnd; 326 | } 327 | 328 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 329 | * follow the packet conservation principle: send P packets per P packets acked. 330 | * After that, we slow-start and send at most 2*P packets per P packets acked. 331 | * After recovery finishes, or upon undo, we restore the cwnd we had when 332 | * recovery started (capped by the target cwnd based on estimated BDP). 333 | * 334 | * TODO(ycheng/ncardwell): implement a rate-based approach. 335 | */ 336 | static bool bbr_set_cwnd_to_recover_or_restore( 337 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 338 | { 339 | struct tcp_sock *tp = tcp_sk(sk); 340 | struct bbr *bbr = inet_csk_ca(sk); 341 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 342 | u32 cwnd = tp->snd_cwnd; 343 | 344 | /* An ACK for P pkts should release at most 2*P packets. We do this 345 | * in two steps. First, here we deduct the number of lost packets. 346 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 347 | */ 348 | if (rs->losses > 0) 349 | cwnd = max_t(s32, cwnd - rs->losses, 1); 350 | 351 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 352 | /* Starting 1st round of Recovery, so do packet conservation. */ 353 | bbr->packet_conservation = 1; 354 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 355 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 356 | cwnd = tcp_packets_in_flight(tp) + acked; 357 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 358 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 359 | bbr->restore_cwnd = 1; 360 | bbr->packet_conservation = 0; 361 | } 362 | bbr->prev_ca_state = state; 363 | 364 | if (bbr->restore_cwnd) { 365 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 366 | cwnd = max(cwnd, bbr->prior_cwnd); 367 | bbr->restore_cwnd = 0; 368 | } 369 | 370 | if (bbr->packet_conservation) { 371 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 372 | return true; /* yes, using packet conservation */ 373 | } 374 | *new_cwnd = cwnd; 375 | return false; 376 | } 377 | 378 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 379 | * has drawn us down below target), or snap down to target if we're above it. 380 | */ 381 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 382 | u32 acked, u32 bw, int gain) 383 | { 384 | struct tcp_sock *tp = tcp_sk(sk); 385 | struct bbr *bbr = inet_csk_ca(sk); 386 | u32 cwnd = 0, target_cwnd = 0; 387 | 388 | if (!acked) 389 | return; 390 | 391 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 392 | goto done; 393 | 394 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 395 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 396 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 397 | cwnd = min(cwnd + acked, target_cwnd); 398 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 399 | cwnd = cwnd + acked; 400 | cwnd = max(cwnd, bbr_cwnd_min_target); 401 | 402 | done: 403 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 404 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 405 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 406 | } 407 | 408 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 409 | static bool bbr_is_next_cycle_phase(struct sock *sk, 410 | const struct rate_sample *rs) 411 | { 412 | struct tcp_sock *tp = tcp_sk(sk); 413 | struct bbr *bbr = inet_csk_ca(sk); 414 | bool is_full_length = 415 | skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > 416 | bbr->min_rtt_us; 417 | u32 inflight, bw; 418 | 419 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 420 | * use the pipe without increasing the queue. 421 | */ 422 | if (bbr->pacing_gain == BBR_UNIT) 423 | return is_full_length; /* just use wall clock time */ 424 | 425 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 426 | bw = bbr_max_bw(sk); 427 | 428 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 429 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 430 | * small (e.g. on a LAN). We do not persist if packets are lost, since 431 | * a path with small buffers may not hold that much. 432 | */ 433 | if (bbr->pacing_gain > BBR_UNIT) 434 | return is_full_length && 435 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 436 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 437 | 438 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 439 | * probing didn't find more bw. If inflight falls to match BDP then we 440 | * estimate queue is drained; persisting would underutilize the pipe. 441 | */ 442 | return is_full_length || 443 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 444 | } 445 | 446 | static void bbr_advance_cycle_phase(struct sock *sk) 447 | { 448 | struct tcp_sock *tp = tcp_sk(sk); 449 | struct bbr *bbr = inet_csk_ca(sk); 450 | 451 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 452 | bbr->cycle_mstamp = tp->delivered_mstamp; 453 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 454 | } 455 | 456 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 457 | static void bbr_update_cycle_phase(struct sock *sk, 458 | const struct rate_sample *rs) 459 | { 460 | struct bbr *bbr = inet_csk_ca(sk); 461 | 462 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 463 | bbr_is_next_cycle_phase(sk, rs)) 464 | bbr_advance_cycle_phase(sk); 465 | } 466 | 467 | static void bbr_reset_startup_mode(struct sock *sk) 468 | { 469 | struct bbr *bbr = inet_csk_ca(sk); 470 | 471 | bbr->mode = BBR_STARTUP; 472 | bbr->pacing_gain = bbr_high_gain; 473 | bbr->cwnd_gain = bbr_high_gain; 474 | } 475 | 476 | static void bbr_reset_probe_bw_mode(struct sock *sk) 477 | { 478 | struct bbr *bbr = inet_csk_ca(sk); 479 | 480 | bbr->mode = BBR_PROBE_BW; 481 | bbr->pacing_gain = BBR_UNIT; 482 | bbr->cwnd_gain = bbr_cwnd_gain; 483 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 484 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 485 | } 486 | 487 | static void bbr_reset_mode(struct sock *sk) 488 | { 489 | if (!bbr_full_bw_reached(sk)) 490 | bbr_reset_startup_mode(sk); 491 | else 492 | bbr_reset_probe_bw_mode(sk); 493 | } 494 | 495 | /* Start a new long-term sampling interval. */ 496 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 497 | { 498 | struct tcp_sock *tp = tcp_sk(sk); 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; 502 | bbr->lt_last_delivered = tp->delivered; 503 | bbr->lt_last_lost = tp->lost; 504 | bbr->lt_rtt_cnt = 0; 505 | } 506 | 507 | /* Completely reset long-term bandwidth sampling. 
*/ 508 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 509 | { 510 | struct bbr *bbr = inet_csk_ca(sk); 511 | 512 | bbr->lt_bw = 0; 513 | bbr->lt_use_bw = 0; 514 | bbr->lt_is_sampling = false; 515 | bbr_reset_lt_bw_sampling_interval(sk); 516 | } 517 | 518 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 519 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 520 | { 521 | struct bbr *bbr = inet_csk_ca(sk); 522 | u32 diff; 523 | 524 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 525 | /* Is new bw close to the lt_bw from the previous interval? */ 526 | diff = abs(bw - bbr->lt_bw); 527 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 528 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 529 | bbr_lt_bw_diff)) { 530 | /* All criteria are met; estimate we're policed. */ 531 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 532 | bbr->lt_use_bw = 1; 533 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 534 | bbr->lt_rtt_cnt = 0; 535 | return; 536 | } 537 | } 538 | bbr->lt_bw = bw; 539 | bbr_reset_lt_bw_sampling_interval(sk); 540 | } 541 | 542 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 543 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 544 | * explicitly models their policed rate, to reduce unnecessary losses. We 545 | * estimate that we're policed if we see 2 consecutive sampling intervals with 546 | * consistent throughput and high packet loss. If we think we're being policed, 547 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 548 | */ 549 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 550 | { 551 | struct tcp_sock *tp = tcp_sk(sk); 552 | struct bbr *bbr = inet_csk_ca(sk); 553 | u32 lost, delivered; 554 | u64 bw; 555 | s32 t; 556 | 557 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 558 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 559 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 560 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 561 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 562 | } 563 | return; 564 | } 565 | 566 | /* Wait for the first loss before sampling, to let the policer exhaust 567 | * its tokens and estimate the steady-state rate allowed by the policer. 568 | * Starting samples earlier includes bursts that over-estimate the bw. 569 | */ 570 | if (!bbr->lt_is_sampling) { 571 | if (!rs->losses) 572 | return; 573 | bbr_reset_lt_bw_sampling_interval(sk); 574 | bbr->lt_is_sampling = true; 575 | } 576 | 577 | /* To avoid underestimates, reset sampling if we run out of data. */ 578 | if (rs->is_app_limited) { 579 | bbr_reset_lt_bw_sampling(sk); 580 | return; 581 | } 582 | 583 | if (bbr->round_start) 584 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 585 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 586 | return; /* sampling interval needs to be longer */ 587 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 588 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 589 | return; 590 | } 591 | 592 | /* End sampling interval when a packet is lost, so we estimate the 593 | * policer tokens were exhausted. Stopping the sampling before the 594 | * tokens are exhausted under-estimates the policed rate. 595 | */ 596 | if (!rs->losses) 597 | return; 598 | 599 | /* Calculate packets lost and delivered in sampling interval. 
*/ 600 | lost = tp->lost - bbr->lt_last_lost; 601 | delivered = tp->delivered - bbr->lt_last_delivered; 602 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 603 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 604 | return; 605 | 606 | /* Find average delivery rate in this sampling interval. */ 607 | t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); 608 | if (t < 1) 609 | return; /* interval is less than one jiffy, so wait */ 610 | t = jiffies_to_usecs(t); 611 | /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ 612 | if (t < 1) { 613 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 614 | return; 615 | } 616 | bw = (u64)delivered * BW_UNIT; 617 | do_div(bw, t); 618 | bbr_lt_bw_interval_done(sk, bw); 619 | } 620 | 621 | /* Estimate the bandwidth based on how fast packets are delivered */ 622 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 623 | { 624 | struct tcp_sock *tp = tcp_sk(sk); 625 | struct bbr *bbr = inet_csk_ca(sk); 626 | u64 bw; 627 | 628 | bbr->round_start = 0; 629 | if (rs->delivered < 0 || rs->interval_us <= 0) 630 | return; /* Not a valid observation */ 631 | 632 | /* See if we've reached the next RTT */ 633 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 634 | bbr->next_rtt_delivered = tp->delivered; 635 | bbr->rtt_cnt++; 636 | bbr->round_start = 1; 637 | bbr->packet_conservation = 0; 638 | } 639 | 640 | bbr_lt_bw_sampling(sk, rs); 641 | 642 | /* Divide delivered by the interval to find a (lower bound) bottleneck 643 | * bandwidth sample. Delivered is in packets and interval_us in uS and 644 | * ratio will be <<1 for most connections. So delivered is first scaled. 645 | */ 646 | bw = (u64)rs->delivered * BW_UNIT; 647 | do_div(bw, rs->interval_us); 648 | 649 | /* If this sample is application-limited, it is likely to have a very 650 | * low delivered count that represents application behavior rather than 651 | * the available network rate. Such a sample could drag down estimated 652 | * bw, causing needless slow-down. Thus, to continue to send at the 653 | * last measured network rate, we filter out app-limited samples unless 654 | * they describe the path bw at least as well as our bw model. 655 | * 656 | * So the goal during app-limited phase is to proceed with the best 657 | * network rate no matter how long. We automatically leave this 658 | * phase when app writes faster than the network can deliver :) 659 | */ 660 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 661 | /* Incorporate new sample into our max bw filter. */ 662 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 663 | } 664 | } 665 | 666 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 667 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 668 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 669 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 670 | * higher rwin, 3: we get higher delivery rate samples. Or transient 671 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 672 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
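 *
 * Illustrative example (hypothetical numbers, not from the source): suppose
 * the filtered max bw is 800 pkts/ms at the start of a round. The growth
 * threshold is 800 * 5/4 = 1000 pkts/ms. If the next three non-app-limited
 * rounds all measure a max bw below 1000 pkts/ms, full_bw_cnt reaches 3 and
 * bbr_full_bw_reached() returns true, ending STARTUP; a single round at or
 * above 1000 pkts/ms would instead reset full_bw to the new measurement and
 * restart the count.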
673 | */ 674 | static void bbr_check_full_bw_reached(struct sock *sk, 675 | const struct rate_sample *rs) 676 | { 677 | struct bbr *bbr = inet_csk_ca(sk); 678 | u32 bw_thresh; 679 | 680 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 681 | return; 682 | 683 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 684 | if (bbr_max_bw(sk) >= bw_thresh) { 685 | bbr->full_bw = bbr_max_bw(sk); 686 | bbr->full_bw_cnt = 0; 687 | return; 688 | } 689 | ++bbr->full_bw_cnt; 690 | } 691 | 692 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 693 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 694 | { 695 | struct bbr *bbr = inet_csk_ca(sk); 696 | 697 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 698 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 699 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 700 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 701 | } /* fall through to check if in-flight is already small: */ 702 | if (bbr->mode == BBR_DRAIN && 703 | tcp_packets_in_flight(tcp_sk(sk)) <= 704 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 705 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 706 | } 707 | 708 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 709 | * periodically drain the bottleneck queue, to converge to measure the true 710 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 711 | * small (reducing queuing delay and packet loss) and achieve fairness among 712 | * BBR flows. 713 | * 714 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 715 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 716 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 717 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 718 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 719 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 720 | * 721 | * Note that flows need only pay 2% if they are busy sending over the last 10 722 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 723 | * natural silences or low-rate periods within 10 seconds where the rate is low 724 | * enough for long enough to drain its queue in the bottleneck. We pick up 725 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 726 | */ 727 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 728 | { 729 | struct tcp_sock *tp = tcp_sk(sk); 730 | struct bbr *bbr = inet_csk_ca(sk); 731 | bool filter_expired; 732 | 733 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 734 | filter_expired = after(tcp_time_stamp, 735 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 736 | if (rs->rtt_us >= 0 && 737 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 738 | bbr->min_rtt_us = rs->rtt_us; 739 | bbr->min_rtt_stamp = tcp_time_stamp; 740 | } 741 | 742 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 743 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 744 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 745 | bbr->pacing_gain = BBR_UNIT; 746 | bbr->cwnd_gain = BBR_UNIT; 747 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 748 | bbr->probe_rtt_done_stamp = 0; 749 | } 750 | 751 | if (bbr->mode == BBR_PROBE_RTT) { 752 | /* Ignore low rate samples during this mode. 
*/ 753 | tp->app_limited = 754 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 755 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 756 | if (!bbr->probe_rtt_done_stamp && 757 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 758 | bbr->probe_rtt_done_stamp = tcp_time_stamp + 759 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 760 | bbr->probe_rtt_round_done = 0; 761 | bbr->next_rtt_delivered = tp->delivered; 762 | } else if (bbr->probe_rtt_done_stamp) { 763 | if (bbr->round_start) 764 | bbr->probe_rtt_round_done = 1; 765 | if (bbr->probe_rtt_round_done && 766 | after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { 767 | bbr->min_rtt_stamp = tcp_time_stamp; 768 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 769 | bbr_reset_mode(sk); 770 | } 771 | } 772 | } 773 | bbr->idle_restart = 0; 774 | } 775 | 776 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 777 | { 778 | bbr_update_bw(sk, rs); 779 | bbr_update_cycle_phase(sk, rs); 780 | bbr_check_full_bw_reached(sk, rs); 781 | bbr_check_drain(sk, rs); 782 | bbr_update_min_rtt(sk, rs); 783 | } 784 | 785 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 786 | { 787 | struct bbr *bbr = inet_csk_ca(sk); 788 | u32 bw; 789 | 790 | bbr_update_model(sk, rs); 791 | 792 | bw = bbr_bw(sk); 793 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 794 | bbr_set_tso_segs_goal(sk); 795 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 796 | } 797 | 798 | static void bbr_init(struct sock *sk) 799 | { 800 | struct tcp_sock *tp = tcp_sk(sk); 801 | struct bbr *bbr = inet_csk_ca(sk); 802 | u64 bw; 803 | 804 | bbr->prior_cwnd = 0; 805 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 806 | bbr->rtt_cnt = 0; 807 | bbr->next_rtt_delivered = 0; 808 | bbr->prev_ca_state = TCP_CA_Open; 809 | bbr->packet_conservation = 0; 810 | 811 | bbr->probe_rtt_done_stamp = 0; 812 | bbr->probe_rtt_round_done = 0; 813 | bbr->min_rtt_us = tcp_min_rtt(tp); 814 | bbr->min_rtt_stamp = tcp_time_stamp; 815 | 816 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 817 | 818 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 819 | bw = (u64)tp->snd_cwnd * BW_UNIT; 820 | do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); 821 | sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ 822 | bbr_set_pacing_rate(sk, bw, bbr_high_gain); 823 | 824 | bbr->restore_cwnd = 0; 825 | bbr->round_start = 0; 826 | bbr->idle_restart = 0; 827 | bbr->full_bw = 0; 828 | bbr->full_bw_cnt = 0; 829 | bbr->cycle_mstamp.v64 = 0; 830 | bbr->cycle_idx = 0; 831 | bbr_reset_lt_bw_sampling(sk); 832 | bbr_reset_startup_mode(sk); 833 | } 834 | 835 | static u32 bbr_sndbuf_expand(struct sock *sk) 836 | { 837 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 838 | return 3; 839 | } 840 | 841 | /* In theory BBR does not need to undo the cwnd since it does not 842 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 843 | */ 844 | static u32 bbr_undo_cwnd(struct sock *sk) 845 | { 846 | return tcp_sk(sk)->snd_cwnd; 847 | } 848 | 849 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ 850 | static u32 bbr_ssthresh(struct sock *sk) 851 | { 852 | bbr_save_cwnd(sk); 853 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 854 | } 855 | 856 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 857 | union tcp_cc_info *info) 858 | { 859 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 860 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 861 | struct tcp_sock *tp = tcp_sk(sk); 862 | struct bbr *bbr = inet_csk_ca(sk); 863 | u64 bw = bbr_bw(sk); 864 | 865 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 866 | memset(&info->bbr, 0, sizeof(info->bbr)); 867 | info->bbr.bbr_bw_lo = (u32)bw; 868 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 869 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 870 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 871 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 872 | *attr = INET_DIAG_BBRINFO; 873 | return sizeof(info->bbr); 874 | } 875 | return 0; 876 | } 877 | 878 | static void bbr_set_state(struct sock *sk, u8 new_state) 879 | { 880 | struct bbr *bbr = inet_csk_ca(sk); 881 | 882 | if (new_state == TCP_CA_Loss) { 883 | struct rate_sample rs = { .losses = 1 }; 884 | 885 | bbr->prev_ca_state = TCP_CA_Loss; 886 | bbr->full_bw = 0; 887 | bbr->round_start = 1; /* treat RTO like end of a round */ 888 | bbr_lt_bw_sampling(sk, &rs); 889 | } 890 | } 891 | 892 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 893 | .flags = TCP_CONG_NON_RESTRICTED, 894 | .name = "bbr", 895 | .owner = THIS_MODULE, 896 | .init = bbr_init, 897 | .cong_control = bbr_main, 898 | .sndbuf_expand = bbr_sndbuf_expand, 899 | .undo_cwnd = bbr_undo_cwnd, 900 | .cwnd_event = bbr_cwnd_event, 901 | .ssthresh = bbr_ssthresh, 902 | .tso_segs_goal = bbr_tso_segs_goal, 903 | .get_info = bbr_get_info, 904 | .set_state = bbr_set_state, 905 | }; 906 | 907 | static int __init bbr_register(void) 908 | { 909 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 910 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 911 | } 912 | 913 | static void __exit bbr_unregister(void) 914 | { 915 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 916 | } 917 | 918 | module_init(bbr_register); 919 | module_exit(bbr_unregister); 920 | 921 | MODULE_AUTHOR("Van Jacobson "); 922 | MODULE_AUTHOR("Neal Cardwell "); 923 | MODULE_AUTHOR("Yuchung Cheng "); 924 | MODULE_AUTHOR("Soheil Hassas Yeganeh "); 925 | MODULE_LICENSE("Dual BSD/GPL"); 926 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 927 | -------------------------------------------------------------------------------- /General/Debian/mod/tcp_nanqinlang-for-v4.10.2.ko: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tcp-nanqinlang/general/0b92fc4efc3a5cc1a2e486b86c6123353f4d5bfe/General/Debian/mod/tcp_nanqinlang-for-v4.10.2.ko -------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.12andbelow/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. 
In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable. 16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * since pacing is integral to the BBR design and implementation. 57 | * BBR without pacing would not function properly, and may incur unnecessary 58 | * high packet loss rates. 59 | */ 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | 67 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 68 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 69 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 70 | * Since the minimum window is >=4 packets, the lower bound isn't 71 | * an issue. The upper bound isn't an issue with existing technologies. 72 | */ 73 | #define BW_SCALE 24 74 | #define BW_UNIT (1 << BW_SCALE) 75 | 76 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ 77 | #define BBR_UNIT (1 << BBR_SCALE) 78 | 79 | /* BBR has the following modes for deciding how fast to send: */ 80 | enum bbr_mode { 81 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 82 | BBR_DRAIN, /* drain any queue created during startup */ 83 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 84 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 85 | }; 86 | 87 | /* BBR congestion control block */ 88 | struct bbr { 89 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 90 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 91 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 92 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 93 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 94 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 95 | struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ 96 | u32 mode:3, /* current bbr_mode in state machine */ 97 | prev_ca_state:3, /* CA state on previous ACK */ 98 | packet_conservation:1, /* use packet conservation? */ 99 | restore_cwnd:1, /* decided to revert cwnd to old value */ 100 | round_start:1, /* start of packet-timed tx->ack round? */ 101 | tso_segs_goal:7, /* segments we want in each skb we send */ 102 | idle_restart:1, /* restarting after idle? */ 103 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 104 | unused:5, 105 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 106 | lt_rtt_cnt:7, /* round trips in long-term interval */ 107 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 108 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 109 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 110 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 111 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 112 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 113 | cwnd_gain:10, /* current gain for setting cwnd */ 114 | full_bw_cnt:3, /* number of rounds without large bw gains */ 115 | cycle_idx:3, /* current index in pacing_gain cycle array */ 116 | unused_b:6; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | 
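
A quick aside on the fixed-point gain arithmetic defined just above: gains are stored in Q8 form (BBR_SCALE = 8, so BBR_UNIT = 256 represents 1.0), and applying a gain is a multiply followed by a right shift. The stand-alone user-space sketch below is not part of this file or of the module; the helper name apply_gain and the sample value are made up purely to illustrate how the high/drain/cwnd gain constants behave when applied. The `+ 1` on bbr_high_gain rounds the truncated integer up so the stored value stays at or above 2/ln(2).

#include <stdio.h>
#include <stdint.h>

#define BBR_SCALE 8                  /* same Q8 scale as the module */
#define BBR_UNIT  (1 << BBR_SCALE)

/* Apply a Q8 gain the way the module does: multiply, then shift the
 * scale factor back out. */
static uint64_t apply_gain(uint64_t val, int gain)
{
        return (val * gain) >> BBR_SCALE;
}

int main(void)
{
        int high_gain  = BBR_UNIT * 2885 / 1000 + 1; /* 739/256 ~= 2.89 ~= 2/ln(2) */
        int drain_gain = BBR_UNIT * 1000 / 2885;     /* 88/256  ~= 0.34 ~= ln(2)/2 */
        int cwnd_gain  = BBR_UNIT * 2;               /* 512/256  = 2.0             */
        uint64_t bw = 1000;                          /* arbitrary example value    */

        printf("high_gain  = %d (%.3fx) -> %llu\n", high_gain,
               high_gain / 256.0, (unsigned long long)apply_gain(bw, high_gain));
        printf("drain_gain = %d (%.3fx) -> %llu\n", drain_gain,
               drain_gain / 256.0, (unsigned long long)apply_gain(bw, drain_gain));
        printf("cwnd_gain  = %d (%.3fx) -> %llu\n", cwnd_gain,
               cwnd_gain / 256.0, (unsigned long long)apply_gain(bw, cwnd_gain));
        return 0;
}
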
/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... */ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? */ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Pace using current bw estimate and a gain factor. In order to help drive the 216 | * network toward lower queues while maintaining high utilization and low 217 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 218 | * estimated bandwidth. This is an important aspect of the design. 
In this 219 | * implementation this slightly lower pacing rate is achieved implicitly by not 220 | * including link-layer headers in the packet size used for the pacing rate. 221 | */ 222 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 223 | { 224 | struct bbr *bbr = inet_csk_ca(sk); 225 | u64 rate = bw; 226 | 227 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 228 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 229 | if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) 230 | sk->sk_pacing_rate = rate; 231 | } 232 | 233 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 234 | static u32 bbr_tso_segs_goal(struct sock *sk) 235 | { 236 | struct bbr *bbr = inet_csk_ca(sk); 237 | 238 | return bbr->tso_segs_goal; 239 | } 240 | 241 | static void bbr_set_tso_segs_goal(struct sock *sk) 242 | { 243 | struct tcp_sock *tp = tcp_sk(sk); 244 | struct bbr *bbr = inet_csk_ca(sk); 245 | u32 min_segs; 246 | 247 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; 248 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 249 | 0x7FU); 250 | } 251 | 252 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 253 | static void bbr_save_cwnd(struct sock *sk) 254 | { 255 | struct tcp_sock *tp = tcp_sk(sk); 256 | struct bbr *bbr = inet_csk_ca(sk); 257 | 258 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 259 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 260 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 261 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 262 | } 263 | 264 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 265 | { 266 | struct tcp_sock *tp = tcp_sk(sk); 267 | struct bbr *bbr = inet_csk_ca(sk); 268 | 269 | if (event == CA_EVENT_TX_START && tp->app_limited) { 270 | bbr->idle_restart = 1; 271 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 272 | * need more speed (we're restarting from idle and app-limited). 273 | */ 274 | if (bbr->mode == BBR_PROBE_BW) 275 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 276 | } 277 | } 278 | 279 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 280 | * estimated bottleneck bandwidth: 281 | * 282 | * cwnd = bw * min_rtt * gain = BDP * gain 283 | * 284 | * The key factor, gain, controls the amount of queue. While a small gain 285 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 286 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 287 | * noise may cause BBR to under-estimate the rate. 288 | * 289 | * To achieve full performance in high-speed paths, we budget enough cwnd to 290 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 291 | * - one skb in sending host Qdisc, 292 | * - one skb in sending host TSO/GSO engine 293 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 294 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 295 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 296 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 297 | * full even with ACK-every-other-packet delayed ACKs. 298 | */ 299 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 300 | { 301 | struct bbr *bbr = inet_csk_ca(sk); 302 | u32 cwnd; 303 | u64 w; 304 | 305 | /* If we've never had a valid RTT sample, cap cwnd at the initial 306 | * default. 
This should only happen when the connection is not using TCP 307 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 308 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 309 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 310 | */ 311 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 312 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 313 | 314 | w = (u64)bw * bbr->min_rtt_us; 315 | 316 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 317 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 318 | 319 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 320 | cwnd += 3 * bbr->tso_segs_goal; 321 | 322 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ 323 | cwnd = (cwnd + 1) & ~1U; 324 | 325 | return cwnd; 326 | } 327 | 328 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 329 | * follow the packet conservation principle: send P packets per P packets acked. 330 | * After that, we slow-start and send at most 2*P packets per P packets acked. 331 | * After recovery finishes, or upon undo, we restore the cwnd we had when 332 | * recovery started (capped by the target cwnd based on estimated BDP). 333 | * 334 | * TODO(ycheng/ncardwell): implement a rate-based approach. 335 | */ 336 | static bool bbr_set_cwnd_to_recover_or_restore( 337 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 338 | { 339 | struct tcp_sock *tp = tcp_sk(sk); 340 | struct bbr *bbr = inet_csk_ca(sk); 341 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 342 | u32 cwnd = tp->snd_cwnd; 343 | 344 | /* An ACK for P pkts should release at most 2*P packets. We do this 345 | * in two steps. First, here we deduct the number of lost packets. 346 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 347 | */ 348 | if (rs->losses > 0) 349 | cwnd = max_t(s32, cwnd - rs->losses, 1); 350 | 351 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 352 | /* Starting 1st round of Recovery, so do packet conservation. */ 353 | bbr->packet_conservation = 1; 354 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 355 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 356 | cwnd = tcp_packets_in_flight(tp) + acked; 357 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 358 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 359 | bbr->restore_cwnd = 1; 360 | bbr->packet_conservation = 0; 361 | } 362 | bbr->prev_ca_state = state; 363 | 364 | if (bbr->restore_cwnd) { 365 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 366 | cwnd = max(cwnd, bbr->prior_cwnd); 367 | bbr->restore_cwnd = 0; 368 | } 369 | 370 | if (bbr->packet_conservation) { 371 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 372 | return true; /* yes, using packet conservation */ 373 | } 374 | *new_cwnd = cwnd; 375 | return false; 376 | } 377 | 378 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 379 | * has drawn us down below target), or snap down to target if we're above it. 
380 | */ 381 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 382 | u32 acked, u32 bw, int gain) 383 | { 384 | struct tcp_sock *tp = tcp_sk(sk); 385 | struct bbr *bbr = inet_csk_ca(sk); 386 | u32 cwnd = 0, target_cwnd = 0; 387 | 388 | if (!acked) 389 | return; 390 | 391 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 392 | goto done; 393 | 394 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 395 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 396 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 397 | cwnd = min(cwnd + acked, target_cwnd); 398 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 399 | cwnd = cwnd + acked; 400 | cwnd = max(cwnd, bbr_cwnd_min_target); 401 | 402 | done: 403 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 404 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 405 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 406 | } 407 | 408 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. */ 409 | static bool bbr_is_next_cycle_phase(struct sock *sk, 410 | const struct rate_sample *rs) 411 | { 412 | struct tcp_sock *tp = tcp_sk(sk); 413 | struct bbr *bbr = inet_csk_ca(sk); 414 | bool is_full_length = 415 | skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > 416 | bbr->min_rtt_us; 417 | u32 inflight, bw; 418 | 419 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 420 | * use the pipe without increasing the queue. 421 | */ 422 | if (bbr->pacing_gain == BBR_UNIT) 423 | return is_full_length; /* just use wall clock time */ 424 | 425 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 426 | bw = bbr_max_bw(sk); 427 | 428 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 429 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 430 | * small (e.g. on a LAN). We do not persist if packets are lost, since 431 | * a path with small buffers may not hold that much. 432 | */ 433 | if (bbr->pacing_gain > BBR_UNIT) 434 | return is_full_length && 435 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 436 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 437 | 438 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 439 | * probing didn't find more bw. If inflight falls to match BDP then we 440 | * estimate queue is drained; persisting would underutilize the pipe. 441 | */ 442 | return is_full_length || 443 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 444 | } 445 | 446 | static void bbr_advance_cycle_phase(struct sock *sk) 447 | { 448 | struct tcp_sock *tp = tcp_sk(sk); 449 | struct bbr *bbr = inet_csk_ca(sk); 450 | 451 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 452 | bbr->cycle_mstamp = tp->delivered_mstamp; 453 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 454 | } 455 | 456 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. 
*/ 457 | static void bbr_update_cycle_phase(struct sock *sk, 458 | const struct rate_sample *rs) 459 | { 460 | struct bbr *bbr = inet_csk_ca(sk); 461 | 462 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 463 | bbr_is_next_cycle_phase(sk, rs)) 464 | bbr_advance_cycle_phase(sk); 465 | } 466 | 467 | static void bbr_reset_startup_mode(struct sock *sk) 468 | { 469 | struct bbr *bbr = inet_csk_ca(sk); 470 | 471 | bbr->mode = BBR_STARTUP; 472 | bbr->pacing_gain = bbr_high_gain; 473 | bbr->cwnd_gain = bbr_high_gain; 474 | } 475 | 476 | static void bbr_reset_probe_bw_mode(struct sock *sk) 477 | { 478 | struct bbr *bbr = inet_csk_ca(sk); 479 | 480 | bbr->mode = BBR_PROBE_BW; 481 | bbr->pacing_gain = BBR_UNIT; 482 | bbr->cwnd_gain = bbr_cwnd_gain; 483 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 484 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 485 | } 486 | 487 | static void bbr_reset_mode(struct sock *sk) 488 | { 489 | if (!bbr_full_bw_reached(sk)) 490 | bbr_reset_startup_mode(sk); 491 | else 492 | bbr_reset_probe_bw_mode(sk); 493 | } 494 | 495 | /* Start a new long-term sampling interval. */ 496 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 497 | { 498 | struct tcp_sock *tp = tcp_sk(sk); 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; 502 | bbr->lt_last_delivered = tp->delivered; 503 | bbr->lt_last_lost = tp->lost; 504 | bbr->lt_rtt_cnt = 0; 505 | } 506 | 507 | /* Completely reset long-term bandwidth sampling. */ 508 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 509 | { 510 | struct bbr *bbr = inet_csk_ca(sk); 511 | 512 | bbr->lt_bw = 0; 513 | bbr->lt_use_bw = 0; 514 | bbr->lt_is_sampling = false; 515 | bbr_reset_lt_bw_sampling_interval(sk); 516 | } 517 | 518 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 519 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 520 | { 521 | struct bbr *bbr = inet_csk_ca(sk); 522 | u32 diff; 523 | 524 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 525 | /* Is new bw close to the lt_bw from the previous interval? */ 526 | diff = abs(bw - bbr->lt_bw); 527 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 528 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 529 | bbr_lt_bw_diff)) { 530 | /* All criteria are met; estimate we're policed. */ 531 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 532 | bbr->lt_use_bw = 1; 533 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 534 | bbr->lt_rtt_cnt = 0; 535 | return; 536 | } 537 | } 538 | bbr->lt_bw = bw; 539 | bbr_reset_lt_bw_sampling_interval(sk); 540 | } 541 | 542 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 543 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 544 | * explicitly models their policed rate, to reduce unnecessary losses. We 545 | * estimate that we're policed if we see 2 consecutive sampling intervals with 546 | * consistent throughput and high packet loss. If we think we're being policed, 547 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 548 | */ 549 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 550 | { 551 | struct tcp_sock *tp = tcp_sk(sk); 552 | struct bbr *bbr = inet_csk_ca(sk); 553 | u32 lost, delivered; 554 | u64 bw; 555 | s32 t; 556 | 557 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? 
*/ 558 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 559 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 560 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 561 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 562 | } 563 | return; 564 | } 565 | 566 | /* Wait for the first loss before sampling, to let the policer exhaust 567 | * its tokens and estimate the steady-state rate allowed by the policer. 568 | * Starting samples earlier includes bursts that over-estimate the bw. 569 | */ 570 | if (!bbr->lt_is_sampling) { 571 | if (!rs->losses) 572 | return; 573 | bbr_reset_lt_bw_sampling_interval(sk); 574 | bbr->lt_is_sampling = true; 575 | } 576 | 577 | /* To avoid underestimates, reset sampling if we run out of data. */ 578 | if (rs->is_app_limited) { 579 | bbr_reset_lt_bw_sampling(sk); 580 | return; 581 | } 582 | 583 | if (bbr->round_start) 584 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 585 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 586 | return; /* sampling interval needs to be longer */ 587 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 588 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 589 | return; 590 | } 591 | 592 | /* End sampling interval when a packet is lost, so we estimate the 593 | * policer tokens were exhausted. Stopping the sampling before the 594 | * tokens are exhausted under-estimates the policed rate. 595 | */ 596 | if (!rs->losses) 597 | return; 598 | 599 | /* Calculate packets lost and delivered in sampling interval. */ 600 | lost = tp->lost - bbr->lt_last_lost; 601 | delivered = tp->delivered - bbr->lt_last_delivered; 602 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 603 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 604 | return; 605 | 606 | /* Find average delivery rate in this sampling interval. */ 607 | t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); 608 | if (t < 1) 609 | return; /* interval is less than one jiffy, so wait */ 610 | t = jiffies_to_usecs(t); 611 | /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ 612 | if (t < 1) { 613 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 614 | return; 615 | } 616 | bw = (u64)delivered * BW_UNIT; 617 | do_div(bw, t); 618 | bbr_lt_bw_interval_done(sk, bw); 619 | } 620 | 621 | /* Estimate the bandwidth based on how fast packets are delivered */ 622 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 623 | { 624 | struct tcp_sock *tp = tcp_sk(sk); 625 | struct bbr *bbr = inet_csk_ca(sk); 626 | u64 bw; 627 | 628 | bbr->round_start = 0; 629 | if (rs->delivered < 0 || rs->interval_us <= 0) 630 | return; /* Not a valid observation */ 631 | 632 | /* See if we've reached the next RTT */ 633 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 634 | bbr->next_rtt_delivered = tp->delivered; 635 | bbr->rtt_cnt++; 636 | bbr->round_start = 1; 637 | bbr->packet_conservation = 0; 638 | } 639 | 640 | bbr_lt_bw_sampling(sk, rs); 641 | 642 | /* Divide delivered by the interval to find a (lower bound) bottleneck 643 | * bandwidth sample. Delivered is in packets and interval_us in uS and 644 | * ratio will be <<1 for most connections. So delivered is first scaled. 645 | */ 646 | bw = (u64)rs->delivered * BW_UNIT; 647 | do_div(bw, rs->interval_us); 648 | 649 | /* If this sample is application-limited, it is likely to have a very 650 | * low delivered count that represents application behavior rather than 651 | * the available network rate. 
Such a sample could drag down estimated 652 | * bw, causing needless slow-down. Thus, to continue to send at the 653 | * last measured network rate, we filter out app-limited samples unless 654 | * they describe the path bw at least as well as our bw model. 655 | * 656 | * So the goal during app-limited phase is to proceed with the best 657 | * network rate no matter how long. We automatically leave this 658 | * phase when app writes faster than the network can deliver :) 659 | */ 660 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 661 | /* Incorporate new sample into our max bw filter. */ 662 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 663 | } 664 | } 665 | 666 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 667 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 668 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 669 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 670 | * higher rwin, 3: we get higher delivery rate samples. Or transient 671 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 672 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 673 | */ 674 | static void bbr_check_full_bw_reached(struct sock *sk, 675 | const struct rate_sample *rs) 676 | { 677 | struct bbr *bbr = inet_csk_ca(sk); 678 | u32 bw_thresh; 679 | 680 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 681 | return; 682 | 683 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 684 | if (bbr_max_bw(sk) >= bw_thresh) { 685 | bbr->full_bw = bbr_max_bw(sk); 686 | bbr->full_bw_cnt = 0; 687 | return; 688 | } 689 | ++bbr->full_bw_cnt; 690 | } 691 | 692 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 693 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 694 | { 695 | struct bbr *bbr = inet_csk_ca(sk); 696 | 697 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 698 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 699 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 700 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 701 | } /* fall through to check if in-flight is already small: */ 702 | if (bbr->mode == BBR_DRAIN && 703 | tcp_packets_in_flight(tcp_sk(sk)) <= 704 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 705 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 706 | } 707 | 708 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 709 | * periodically drain the bottleneck queue, to converge to measure the true 710 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 711 | * small (reducing queuing delay and packet loss) and achieve fairness among 712 | * BBR flows. 713 | * 714 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 715 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 716 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 717 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 718 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 719 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 720 | * 721 | * Note that flows need only pay 2% if they are busy sending over the last 10 722 | * seconds. 
Interactive applications (e.g., Web, RPCs, video chunks) often have 723 | * natural silences or low-rate periods within 10 seconds where the rate is low 724 | * enough for long enough to drain its queue in the bottleneck. We pick up 725 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 726 | */ 727 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 728 | { 729 | struct tcp_sock *tp = tcp_sk(sk); 730 | struct bbr *bbr = inet_csk_ca(sk); 731 | bool filter_expired; 732 | 733 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 734 | filter_expired = after(tcp_time_stamp, 735 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 736 | if (rs->rtt_us >= 0 && 737 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 738 | bbr->min_rtt_us = rs->rtt_us; 739 | bbr->min_rtt_stamp = tcp_time_stamp; 740 | } 741 | 742 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 743 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 744 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 745 | bbr->pacing_gain = BBR_UNIT; 746 | bbr->cwnd_gain = BBR_UNIT; 747 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 748 | bbr->probe_rtt_done_stamp = 0; 749 | } 750 | 751 | if (bbr->mode == BBR_PROBE_RTT) { 752 | /* Ignore low rate samples during this mode. */ 753 | tp->app_limited = 754 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 755 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 756 | if (!bbr->probe_rtt_done_stamp && 757 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 758 | bbr->probe_rtt_done_stamp = tcp_time_stamp + 759 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 760 | bbr->probe_rtt_round_done = 0; 761 | bbr->next_rtt_delivered = tp->delivered; 762 | } else if (bbr->probe_rtt_done_stamp) { 763 | if (bbr->round_start) 764 | bbr->probe_rtt_round_done = 1; 765 | if (bbr->probe_rtt_round_done && 766 | after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { 767 | bbr->min_rtt_stamp = tcp_time_stamp; 768 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 769 | bbr_reset_mode(sk); 770 | } 771 | } 772 | } 773 | bbr->idle_restart = 0; 774 | } 775 | 776 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 777 | { 778 | bbr_update_bw(sk, rs); 779 | bbr_update_cycle_phase(sk, rs); 780 | bbr_check_full_bw_reached(sk, rs); 781 | bbr_check_drain(sk, rs); 782 | bbr_update_min_rtt(sk, rs); 783 | } 784 | 785 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 786 | { 787 | struct bbr *bbr = inet_csk_ca(sk); 788 | u32 bw; 789 | 790 | bbr_update_model(sk, rs); 791 | 792 | bw = bbr_bw(sk); 793 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 794 | bbr_set_tso_segs_goal(sk); 795 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 796 | } 797 | 798 | static void bbr_init(struct sock *sk) 799 | { 800 | struct tcp_sock *tp = tcp_sk(sk); 801 | struct bbr *bbr = inet_csk_ca(sk); 802 | u64 bw; 803 | 804 | bbr->prior_cwnd = 0; 805 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 806 | bbr->rtt_cnt = 0; 807 | bbr->next_rtt_delivered = 0; 808 | bbr->prev_ca_state = TCP_CA_Open; 809 | bbr->packet_conservation = 0; 810 | 811 | bbr->probe_rtt_done_stamp = 0; 812 | bbr->probe_rtt_round_done = 0; 813 | bbr->min_rtt_us = tcp_min_rtt(tp); 814 | bbr->min_rtt_stamp = tcp_time_stamp; 815 | 816 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 817 | 818 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. 
*/ 819 | bw = (u64)tp->snd_cwnd * BW_UNIT; 820 | do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); 821 | sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ 822 | bbr_set_pacing_rate(sk, bw, bbr_high_gain); 823 | 824 | bbr->restore_cwnd = 0; 825 | bbr->round_start = 0; 826 | bbr->idle_restart = 0; 827 | bbr->full_bw = 0; 828 | bbr->full_bw_cnt = 0; 829 | bbr->cycle_mstamp.v64 = 0; 830 | bbr->cycle_idx = 0; 831 | bbr_reset_lt_bw_sampling(sk); 832 | bbr_reset_startup_mode(sk); 833 | } 834 | 835 | static u32 bbr_sndbuf_expand(struct sock *sk) 836 | { 837 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 838 | return 3; 839 | } 840 | 841 | /* In theory BBR does not need to undo the cwnd since it does not 842 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 843 | */ 844 | static u32 bbr_undo_cwnd(struct sock *sk) 845 | { 846 | return tcp_sk(sk)->snd_cwnd; 847 | } 848 | 849 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ 850 | static u32 bbr_ssthresh(struct sock *sk) 851 | { 852 | bbr_save_cwnd(sk); 853 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 854 | } 855 | 856 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 857 | union tcp_cc_info *info) 858 | { 859 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 860 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 861 | struct tcp_sock *tp = tcp_sk(sk); 862 | struct bbr *bbr = inet_csk_ca(sk); 863 | u64 bw = bbr_bw(sk); 864 | 865 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 866 | memset(&info->bbr, 0, sizeof(info->bbr)); 867 | info->bbr.bbr_bw_lo = (u32)bw; 868 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 869 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 870 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 871 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 872 | *attr = INET_DIAG_BBRINFO; 873 | return sizeof(info->bbr); 874 | } 875 | return 0; 876 | } 877 | 878 | static void bbr_set_state(struct sock *sk, u8 new_state) 879 | { 880 | struct bbr *bbr = inet_csk_ca(sk); 881 | 882 | if (new_state == TCP_CA_Loss) { 883 | struct rate_sample rs = { .losses = 1 }; 884 | 885 | bbr->prev_ca_state = TCP_CA_Loss; 886 | bbr->full_bw = 0; 887 | bbr->round_start = 1; /* treat RTO like end of a round */ 888 | bbr_lt_bw_sampling(sk, &rs); 889 | } 890 | } 891 | 892 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 893 | .flags = TCP_CONG_NON_RESTRICTED, 894 | .name = "bbr", 895 | .owner = THIS_MODULE, 896 | .init = bbr_init, 897 | .cong_control = bbr_main, 898 | .sndbuf_expand = bbr_sndbuf_expand, 899 | .undo_cwnd = bbr_undo_cwnd, 900 | .cwnd_event = bbr_cwnd_event, 901 | .ssthresh = bbr_ssthresh, 902 | .tso_segs_goal = bbr_tso_segs_goal, 903 | .get_info = bbr_get_info, 904 | .set_state = bbr_set_state, 905 | }; 906 | 907 | static int __init bbr_register(void) 908 | { 909 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 910 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 911 | } 912 | 913 | static void __exit bbr_unregister(void) 914 | { 915 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 916 | } 917 | 918 | module_init(bbr_register); 919 | module_exit(bbr_unregister); 920 | 921 | MODULE_AUTHOR("Van Jacobson "); 922 | MODULE_AUTHOR("Neal Cardwell "); 923 | MODULE_AUTHOR("Yuchung Cheng "); 924 | MODULE_AUTHOR("Soheil Hassas Yeganeh "); 925 | MODULE_LICENSE("Dual BSD/GPL"); 926 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 927 | 
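
To make the unit handling in the file above easier to follow, here is a stand-alone user-space sketch (not part of the repository) that mirrors the arithmetic of bbr_update_bw(), bbr_target_cwnd() and bbr_rate_bytes_per_sec() with made-up sample numbers. The variable names and the 100-packet / 10 ms / 1500-byte figures are assumptions for illustration only, and the tso_segs headroom and even-number rounding applied by the real bbr_target_cwnd() are left out for brevity.

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE  24
#define BW_UNIT   (1ULL << BW_SCALE)
#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)
#define USEC_PER_SEC 1000000ULL

int main(void)
{
        /* Made-up sample: 100 packets delivered over a 10 ms interval on a
         * path with a 10 ms min RTT and ~1500-byte packets. */
        uint64_t delivered   = 100;
        uint64_t interval_us = 10000;
        uint64_t min_rtt_us  = 10000;
        uint64_t mtu_bytes   = 1500;
        int gain = BBR_UNIT * 2;        /* bbr_cwnd_gain */

        /* Bandwidth sample in pkts/uS << BW_SCALE, as in bbr_update_bw(). */
        uint64_t bw = delivered * BW_UNIT / interval_us;

        /* Core of bbr_target_cwnd(): cwnd = BDP * gain, rounded up to packets. */
        uint64_t w = bw * min_rtt_us;
        uint64_t cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;

        /* Same multiply/shift order as bbr_rate_bytes_per_sec() (gain = 1.0),
         * chosen there to keep the intermediate products inside a u64. */
        uint64_t rate = bw;
        rate *= mtu_bytes;
        rate *= BBR_UNIT;
        rate >>= BBR_SCALE;
        rate *= USEC_PER_SEC;
        rate >>= BW_SCALE;

        printf("bw sample    = %llu (pkts/uS << 24)\n", (unsigned long long)bw);
        printf("target cwnd  = %llu packets (true BDP * gain = 200)\n",
               (unsigned long long)cwnd);
        printf("pacing rate  = %llu bytes/sec (~15 MB/s)\n",
               (unsigned long long)rate);
        return 0;
}

Keeping the rate in pkts/uS << 24 until the final conversion is what lets a u32 cover the bandwidth range quoted near the top of the file (roughly 715 bps up to a few Tbps) without losing precision at low rates.
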
-------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.13/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable. 16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * otherwise TCP stack falls back to an internal pacing using one high 57 | * resolution timer per TCP socket and may use more resources. 58 | */ 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | 66 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 67 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 68 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 69 | * Since the minimum window is >=4 packets, the lower bound isn't 70 | * an issue. The upper bound isn't an issue with existing technologies. 71 | */ 72 | #define BW_SCALE 24 73 | #define BW_UNIT (1 << BW_SCALE) 74 | 75 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ 76 | #define BBR_UNIT (1 << BBR_SCALE) 77 | 78 | /* BBR has the following modes for deciding how fast to send: */ 79 | enum bbr_mode { 80 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 | BBR_DRAIN, /* drain any queue created during startup */ 82 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 84 | }; 85 | 86 | /* BBR congestion control block */ 87 | struct bbr { 88 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 89 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 90 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 91 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 92 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 93 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 94 | u64 cycle_mstamp; /* time of this cycle phase start */ 95 | u32 mode:3, /* current bbr_mode in state machine */ 96 | prev_ca_state:3, /* CA state on previous ACK */ 97 | packet_conservation:1, /* use packet conservation? */ 98 | restore_cwnd:1, /* decided to revert cwnd to old value */ 99 | round_start:1, /* start of packet-timed tx->ack round? */ 100 | tso_segs_goal:7, /* segments we want in each skb we send */ 101 | idle_restart:1, /* restarting after idle? */ 102 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 103 | unused:5, 104 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 105 | lt_rtt_cnt:7, /* round trips in long-term interval */ 106 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 107 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 108 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 109 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 110 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 111 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 112 | cwnd_gain:10, /* current gain for setting cwnd */ 113 | full_bw_cnt:3, /* number of rounds without large bw gains */ 114 | cycle_idx:3, /* current index in pacing_gain cycle array */ 115 | has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ 116 | unused_b:5; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... */ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? 
*/ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 216 | static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 217 | { 218 | u64 rate = bw; 219 | 220 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 221 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 222 | return rate; 223 | } 224 | 225 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 226 | static void bbr_init_pacing_rate_from_rtt(struct sock *sk) 227 | { 228 | struct tcp_sock *tp = tcp_sk(sk); 229 | struct bbr *bbr = inet_csk_ca(sk); 230 | u64 bw; 231 | u32 rtt_us; 232 | 233 | if (tp->srtt_us) { /* any RTT sample yet? */ 234 | rtt_us = max(tp->srtt_us >> 3, 1U); 235 | bbr->has_seen_rtt = 1; 236 | } else { /* no RTT sample yet */ 237 | rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ 238 | } 239 | bw = (u64)tp->snd_cwnd * BW_UNIT; 240 | do_div(bw, rtt_us); 241 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); 242 | } 243 | 244 | /* Pace using current bw estimate and a gain factor. In order to help drive the 245 | * network toward lower queues while maintaining high utilization and low 246 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 247 | * estimated bandwidth. This is an important aspect of the design. In this 248 | * implementation this slightly lower pacing rate is achieved implicitly by not 249 | * including link-layer headers in the packet size used for the pacing rate. 250 | */ 251 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 252 | { 253 | struct tcp_sock *tp = tcp_sk(sk); 254 | struct bbr *bbr = inet_csk_ca(sk); 255 | u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 256 | 257 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 258 | bbr_init_pacing_rate_from_rtt(sk); 259 | if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) 260 | sk->sk_pacing_rate = rate; 261 | } 262 | 263 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 264 | static u32 bbr_tso_segs_goal(struct sock *sk) 265 | { 266 | struct bbr *bbr = inet_csk_ca(sk); 267 | 268 | return bbr->tso_segs_goal; 269 | } 270 | 271 | static void bbr_set_tso_segs_goal(struct sock *sk) 272 | { 273 | struct tcp_sock *tp = tcp_sk(sk); 274 | struct bbr *bbr = inet_csk_ca(sk); 275 | u32 min_segs; 276 | 277 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 278 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 279 | 0x7FU); 280 | } 281 | 282 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 283 | static void bbr_save_cwnd(struct sock *sk) 284 | { 285 | struct tcp_sock *tp = tcp_sk(sk); 286 | struct bbr *bbr = inet_csk_ca(sk); 287 | 288 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 289 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 290 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 291 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 292 | } 293 | 294 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 295 | { 296 | struct tcp_sock *tp = tcp_sk(sk); 297 | struct bbr *bbr = inet_csk_ca(sk); 298 | 299 | if (event == CA_EVENT_TX_START && tp->app_limited) { 300 | bbr->idle_restart = 1; 301 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 302 | * need more speed (we're restarting from idle and app-limited). 303 | */ 304 | if (bbr->mode == BBR_PROBE_BW) 305 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 306 | } 307 | } 308 | 309 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 310 | * estimated bottleneck bandwidth: 311 | * 312 | * cwnd = bw * min_rtt * gain = BDP * gain 313 | * 314 | * The key factor, gain, controls the amount of queue. While a small gain 315 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 316 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 317 | * noise may cause BBR to under-estimate the rate. 318 | * 319 | * To achieve full performance in high-speed paths, we budget enough cwnd to 320 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 321 | * - one skb in sending host Qdisc, 322 | * - one skb in sending host TSO/GSO engine 323 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 324 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 325 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 326 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 327 | * full even with ACK-every-other-packet delayed ACKs. 328 | */ 329 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 330 | { 331 | struct bbr *bbr = inet_csk_ca(sk); 332 | u32 cwnd; 333 | u64 w; 334 | 335 | /* If we've never had a valid RTT sample, cap cwnd at the initial 336 | * default. This should only happen when the connection is not using TCP 337 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 338 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 339 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 340 | */ 341 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 342 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 343 | 344 | w = (u64)bw * bbr->min_rtt_us; 345 | 346 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 347 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 348 | 349 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 350 | cwnd += 3 * bbr->tso_segs_goal; 351 | 352 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 353 | cwnd = (cwnd + 1) & ~1U; 354 | 355 | return cwnd; 356 | } 357 | 358 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 359 | * follow the packet conservation principle: send P packets per P packets acked. 360 | * After that, we slow-start and send at most 2*P packets per P packets acked. 361 | * After recovery finishes, or upon undo, we restore the cwnd we had when 362 | * recovery started (capped by the target cwnd based on estimated BDP). 363 | * 364 | * TODO(ycheng/ncardwell): implement a rate-based approach. 365 | */ 366 | static bool bbr_set_cwnd_to_recover_or_restore( 367 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 368 | { 369 | struct tcp_sock *tp = tcp_sk(sk); 370 | struct bbr *bbr = inet_csk_ca(sk); 371 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 372 | u32 cwnd = tp->snd_cwnd; 373 | 374 | /* An ACK for P pkts should release at most 2*P packets. We do this 375 | * in two steps. First, here we deduct the number of lost packets. 376 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 377 | */ 378 | if (rs->losses > 0) 379 | cwnd = max_t(s32, cwnd - rs->losses, 1); 380 | 381 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 382 | /* Starting 1st round of Recovery, so do packet conservation. */ 383 | bbr->packet_conservation = 1; 384 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 385 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 386 | cwnd = tcp_packets_in_flight(tp) + acked; 387 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 388 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 389 | bbr->restore_cwnd = 1; 390 | bbr->packet_conservation = 0; 391 | } 392 | bbr->prev_ca_state = state; 393 | 394 | if (bbr->restore_cwnd) { 395 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 396 | cwnd = max(cwnd, bbr->prior_cwnd); 397 | bbr->restore_cwnd = 0; 398 | } 399 | 400 | if (bbr->packet_conservation) { 401 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 402 | return true; /* yes, using packet conservation */ 403 | } 404 | *new_cwnd = cwnd; 405 | return false; 406 | } 407 | 408 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 409 | * has drawn us down below target), or snap down to target if we're above it. 410 | */ 411 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 412 | u32 acked, u32 bw, int gain) 413 | { 414 | struct tcp_sock *tp = tcp_sk(sk); 415 | struct bbr *bbr = inet_csk_ca(sk); 416 | u32 cwnd = 0, target_cwnd = 0; 417 | 418 | if (!acked) 419 | return; 420 | 421 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 422 | goto done; 423 | 424 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 425 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 426 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 427 | cwnd = min(cwnd + acked, target_cwnd); 428 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 429 | cwnd = cwnd + acked; 430 | cwnd = max(cwnd, bbr_cwnd_min_target); 431 | 432 | done: 433 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 434 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 435 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 436 | } 437 | 438 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 439 | static bool bbr_is_next_cycle_phase(struct sock *sk, 440 | const struct rate_sample *rs) 441 | { 442 | struct tcp_sock *tp = tcp_sk(sk); 443 | struct bbr *bbr = inet_csk_ca(sk); 444 | bool is_full_length = 445 | tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > 446 | bbr->min_rtt_us; 447 | u32 inflight, bw; 448 | 449 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 450 | * use the pipe without increasing the queue. 451 | */ 452 | if (bbr->pacing_gain == BBR_UNIT) 453 | return is_full_length; /* just use wall clock time */ 454 | 455 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 456 | bw = bbr_max_bw(sk); 457 | 458 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 459 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 460 | * small (e.g. on a LAN). We do not persist if packets are lost, since 461 | * a path with small buffers may not hold that much. 462 | */ 463 | if (bbr->pacing_gain > BBR_UNIT) 464 | return is_full_length && 465 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 466 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 467 | 468 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 469 | * probing didn't find more bw. If inflight falls to match BDP then we 470 | * estimate queue is drained; persisting would underutilize the pipe. 471 | */ 472 | return is_full_length || 473 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 474 | } 475 | 476 | static void bbr_advance_cycle_phase(struct sock *sk) 477 | { 478 | struct tcp_sock *tp = tcp_sk(sk); 479 | struct bbr *bbr = inet_csk_ca(sk); 480 | 481 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 482 | bbr->cycle_mstamp = tp->delivered_mstamp; 483 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 484 | } 485 | 486 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 487 | static void bbr_update_cycle_phase(struct sock *sk, 488 | const struct rate_sample *rs) 489 | { 490 | struct bbr *bbr = inet_csk_ca(sk); 491 | 492 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 493 | bbr_is_next_cycle_phase(sk, rs)) 494 | bbr_advance_cycle_phase(sk); 495 | } 496 | 497 | static void bbr_reset_startup_mode(struct sock *sk) 498 | { 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->mode = BBR_STARTUP; 502 | bbr->pacing_gain = bbr_high_gain; 503 | bbr->cwnd_gain = bbr_high_gain; 504 | } 505 | 506 | static void bbr_reset_probe_bw_mode(struct sock *sk) 507 | { 508 | struct bbr *bbr = inet_csk_ca(sk); 509 | 510 | bbr->mode = BBR_PROBE_BW; 511 | bbr->pacing_gain = BBR_UNIT; 512 | bbr->cwnd_gain = bbr_cwnd_gain; 513 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 514 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 515 | } 516 | 517 | static void bbr_reset_mode(struct sock *sk) 518 | { 519 | if (!bbr_full_bw_reached(sk)) 520 | bbr_reset_startup_mode(sk); 521 | else 522 | bbr_reset_probe_bw_mode(sk); 523 | } 524 | 525 | /* Start a new long-term sampling interval. */ 526 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 527 | { 528 | struct tcp_sock *tp = tcp_sk(sk); 529 | struct bbr *bbr = inet_csk_ca(sk); 530 | 531 | bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); 532 | bbr->lt_last_delivered = tp->delivered; 533 | bbr->lt_last_lost = tp->lost; 534 | bbr->lt_rtt_cnt = 0; 535 | } 536 | 537 | /* Completely reset long-term bandwidth sampling. 
*/ 538 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 539 | { 540 | struct bbr *bbr = inet_csk_ca(sk); 541 | 542 | bbr->lt_bw = 0; 543 | bbr->lt_use_bw = 0; 544 | bbr->lt_is_sampling = false; 545 | bbr_reset_lt_bw_sampling_interval(sk); 546 | } 547 | 548 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 549 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 550 | { 551 | struct bbr *bbr = inet_csk_ca(sk); 552 | u32 diff; 553 | 554 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 555 | /* Is new bw close to the lt_bw from the previous interval? */ 556 | diff = abs(bw - bbr->lt_bw); 557 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 558 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 559 | bbr_lt_bw_diff)) { 560 | /* All criteria are met; estimate we're policed. */ 561 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 562 | bbr->lt_use_bw = 1; 563 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 564 | bbr->lt_rtt_cnt = 0; 565 | return; 566 | } 567 | } 568 | bbr->lt_bw = bw; 569 | bbr_reset_lt_bw_sampling_interval(sk); 570 | } 571 | 572 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 573 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 574 | * explicitly models their policed rate, to reduce unnecessary losses. We 575 | * estimate that we're policed if we see 2 consecutive sampling intervals with 576 | * consistent throughput and high packet loss. If we think we're being policed, 577 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 578 | */ 579 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 580 | { 581 | struct tcp_sock *tp = tcp_sk(sk); 582 | struct bbr *bbr = inet_csk_ca(sk); 583 | u32 lost, delivered; 584 | u64 bw; 585 | u32 t; 586 | 587 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 588 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 589 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 590 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 591 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 592 | } 593 | return; 594 | } 595 | 596 | /* Wait for the first loss before sampling, to let the policer exhaust 597 | * its tokens and estimate the steady-state rate allowed by the policer. 598 | * Starting samples earlier includes bursts that over-estimate the bw. 599 | */ 600 | if (!bbr->lt_is_sampling) { 601 | if (!rs->losses) 602 | return; 603 | bbr_reset_lt_bw_sampling_interval(sk); 604 | bbr->lt_is_sampling = true; 605 | } 606 | 607 | /* To avoid underestimates, reset sampling if we run out of data. */ 608 | if (rs->is_app_limited) { 609 | bbr_reset_lt_bw_sampling(sk); 610 | return; 611 | } 612 | 613 | if (bbr->round_start) 614 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 615 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 616 | return; /* sampling interval needs to be longer */ 617 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 618 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 619 | return; 620 | } 621 | 622 | /* End sampling interval when a packet is lost, so we estimate the 623 | * policer tokens were exhausted. Stopping the sampling before the 624 | * tokens are exhausted under-estimates the policed rate. 625 | */ 626 | if (!rs->losses) 627 | return; 628 | 629 | /* Calculate packets lost and delivered in sampling interval. 
*/ 630 | lost = tp->lost - bbr->lt_last_lost; 631 | delivered = tp->delivered - bbr->lt_last_delivered; 632 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 633 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 634 | return; 635 | 636 | /* Find average delivery rate in this sampling interval. */ 637 | t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; 638 | if ((s32)t < 1) 639 | return; /* interval is less than one ms, so wait */ 640 | /* Check if can multiply without overflow */ 641 | if (t >= ~0U / USEC_PER_MSEC) { 642 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 643 | return; 644 | } 645 | t *= USEC_PER_MSEC; 646 | bw = (u64)delivered * BW_UNIT; 647 | do_div(bw, t); 648 | bbr_lt_bw_interval_done(sk, bw); 649 | } 650 | 651 | /* Estimate the bandwidth based on how fast packets are delivered */ 652 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 653 | { 654 | struct tcp_sock *tp = tcp_sk(sk); 655 | struct bbr *bbr = inet_csk_ca(sk); 656 | u64 bw; 657 | 658 | bbr->round_start = 0; 659 | if (rs->delivered < 0 || rs->interval_us <= 0) 660 | return; /* Not a valid observation */ 661 | 662 | /* See if we've reached the next RTT */ 663 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 664 | bbr->next_rtt_delivered = tp->delivered; 665 | bbr->rtt_cnt++; 666 | bbr->round_start = 1; 667 | bbr->packet_conservation = 0; 668 | } 669 | 670 | bbr_lt_bw_sampling(sk, rs); 671 | 672 | /* Divide delivered by the interval to find a (lower bound) bottleneck 673 | * bandwidth sample. Delivered is in packets and interval_us in uS and 674 | * ratio will be <<1 for most connections. So delivered is first scaled. 675 | */ 676 | bw = (u64)rs->delivered * BW_UNIT; 677 | do_div(bw, rs->interval_us); 678 | 679 | /* If this sample is application-limited, it is likely to have a very 680 | * low delivered count that represents application behavior rather than 681 | * the available network rate. Such a sample could drag down estimated 682 | * bw, causing needless slow-down. Thus, to continue to send at the 683 | * last measured network rate, we filter out app-limited samples unless 684 | * they describe the path bw at least as well as our bw model. 685 | * 686 | * So the goal during app-limited phase is to proceed with the best 687 | * network rate no matter how long. We automatically leave this 688 | * phase when app writes faster than the network can deliver :) 689 | */ 690 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 691 | /* Incorporate new sample into our max bw filter. */ 692 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 693 | } 694 | } 695 | 696 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 697 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 698 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 699 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 700 | * higher rwin, 3: we get higher delivery rate samples. Or transient 701 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 702 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
703 | */ 704 | static void bbr_check_full_bw_reached(struct sock *sk, 705 | const struct rate_sample *rs) 706 | { 707 | struct bbr *bbr = inet_csk_ca(sk); 708 | u32 bw_thresh; 709 | 710 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 711 | return; 712 | 713 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 714 | if (bbr_max_bw(sk) >= bw_thresh) { 715 | bbr->full_bw = bbr_max_bw(sk); 716 | bbr->full_bw_cnt = 0; 717 | return; 718 | } 719 | ++bbr->full_bw_cnt; 720 | } 721 | 722 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 723 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 724 | { 725 | struct bbr *bbr = inet_csk_ca(sk); 726 | 727 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 728 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 729 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 730 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 731 | } /* fall through to check if in-flight is already small: */ 732 | if (bbr->mode == BBR_DRAIN && 733 | tcp_packets_in_flight(tcp_sk(sk)) <= 734 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 735 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 736 | } 737 | 738 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 739 | * periodically drain the bottleneck queue, to converge to measure the true 740 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 741 | * small (reducing queuing delay and packet loss) and achieve fairness among 742 | * BBR flows. 743 | * 744 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 745 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 746 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 747 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 748 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 749 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 750 | * 751 | * Note that flows need only pay 2% if they are busy sending over the last 10 752 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 753 | * natural silences or low-rate periods within 10 seconds where the rate is low 754 | * enough for long enough to drain its queue in the bottleneck. We pick up 755 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 756 | */ 757 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 758 | { 759 | struct tcp_sock *tp = tcp_sk(sk); 760 | struct bbr *bbr = inet_csk_ca(sk); 761 | bool filter_expired; 762 | 763 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 764 | filter_expired = after(tcp_jiffies32, 765 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 766 | if (rs->rtt_us >= 0 && 767 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 768 | bbr->min_rtt_us = rs->rtt_us; 769 | bbr->min_rtt_stamp = tcp_jiffies32; 770 | } 771 | 772 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 773 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 774 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 775 | bbr->pacing_gain = BBR_UNIT; 776 | bbr->cwnd_gain = BBR_UNIT; 777 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 778 | bbr->probe_rtt_done_stamp = 0; 779 | } 780 | 781 | if (bbr->mode == BBR_PROBE_RTT) { 782 | /* Ignore low rate samples during this mode. 
*/ 783 | tp->app_limited = 784 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 785 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 786 | if (!bbr->probe_rtt_done_stamp && 787 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 788 | bbr->probe_rtt_done_stamp = tcp_jiffies32 + 789 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 790 | bbr->probe_rtt_round_done = 0; 791 | bbr->next_rtt_delivered = tp->delivered; 792 | } else if (bbr->probe_rtt_done_stamp) { 793 | if (bbr->round_start) 794 | bbr->probe_rtt_round_done = 1; 795 | if (bbr->probe_rtt_round_done && 796 | after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 797 | bbr->min_rtt_stamp = tcp_jiffies32; 798 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 799 | bbr_reset_mode(sk); 800 | } 801 | } 802 | } 803 | bbr->idle_restart = 0; 804 | } 805 | 806 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 807 | { 808 | bbr_update_bw(sk, rs); 809 | bbr_update_cycle_phase(sk, rs); 810 | bbr_check_full_bw_reached(sk, rs); 811 | bbr_check_drain(sk, rs); 812 | bbr_update_min_rtt(sk, rs); 813 | } 814 | 815 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 816 | { 817 | struct bbr *bbr = inet_csk_ca(sk); 818 | u32 bw; 819 | 820 | bbr_update_model(sk, rs); 821 | 822 | bw = bbr_bw(sk); 823 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 824 | bbr_set_tso_segs_goal(sk); 825 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 826 | } 827 | 828 | static void bbr_init(struct sock *sk) 829 | { 830 | struct tcp_sock *tp = tcp_sk(sk); 831 | struct bbr *bbr = inet_csk_ca(sk); 832 | 833 | bbr->prior_cwnd = 0; 834 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 835 | bbr->rtt_cnt = 0; 836 | bbr->next_rtt_delivered = 0; 837 | bbr->prev_ca_state = TCP_CA_Open; 838 | bbr->packet_conservation = 0; 839 | 840 | bbr->probe_rtt_done_stamp = 0; 841 | bbr->probe_rtt_round_done = 0; 842 | bbr->min_rtt_us = tcp_min_rtt(tp); 843 | bbr->min_rtt_stamp = tcp_jiffies32; 844 | 845 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 846 | 847 | bbr->has_seen_rtt = 0; 848 | bbr_init_pacing_rate_from_rtt(sk); 849 | 850 | bbr->restore_cwnd = 0; 851 | bbr->round_start = 0; 852 | bbr->idle_restart = 0; 853 | bbr->full_bw = 0; 854 | bbr->full_bw_cnt = 0; 855 | bbr->cycle_mstamp = 0; 856 | bbr->cycle_idx = 0; 857 | bbr_reset_lt_bw_sampling(sk); 858 | bbr_reset_startup_mode(sk); 859 | 860 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 861 | } 862 | 863 | static u32 bbr_sndbuf_expand(struct sock *sk) 864 | { 865 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 866 | return 3; 867 | } 868 | 869 | /* In theory BBR does not need to undo the cwnd since it does not 870 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 871 | */ 872 | static u32 bbr_undo_cwnd(struct sock *sk) 873 | { 874 | return tcp_sk(sk)->snd_cwnd; 875 | } 876 | 877 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ 878 | static u32 bbr_ssthresh(struct sock *sk) 879 | { 880 | bbr_save_cwnd(sk); 881 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 882 | } 883 | 884 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 885 | union tcp_cc_info *info) 886 | { 887 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 888 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 889 | struct tcp_sock *tp = tcp_sk(sk); 890 | struct bbr *bbr = inet_csk_ca(sk); 891 | u64 bw = bbr_bw(sk); 892 | 893 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 894 | memset(&info->bbr, 0, sizeof(info->bbr)); 895 | info->bbr.bbr_bw_lo = (u32)bw; 896 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 897 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 898 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 899 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 900 | *attr = INET_DIAG_BBRINFO; 901 | return sizeof(info->bbr); 902 | } 903 | return 0; 904 | } 905 | 906 | static void bbr_set_state(struct sock *sk, u8 new_state) 907 | { 908 | struct bbr *bbr = inet_csk_ca(sk); 909 | 910 | if (new_state == TCP_CA_Loss) { 911 | struct rate_sample rs = { .losses = 1 }; 912 | 913 | bbr->prev_ca_state = TCP_CA_Loss; 914 | bbr->full_bw = 0; 915 | bbr->round_start = 1; /* treat RTO like end of a round */ 916 | bbr_lt_bw_sampling(sk, &rs); 917 | } 918 | } 919 | 920 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 921 | .flags = TCP_CONG_NON_RESTRICTED, 922 | .name = "bbr", 923 | .owner = THIS_MODULE, 924 | .init = bbr_init, 925 | .cong_control = bbr_main, 926 | .sndbuf_expand = bbr_sndbuf_expand, 927 | .undo_cwnd = bbr_undo_cwnd, 928 | .cwnd_event = bbr_cwnd_event, 929 | .ssthresh = bbr_ssthresh, 930 | .tso_segs_goal = bbr_tso_segs_goal, 931 | .get_info = bbr_get_info, 932 | .set_state = bbr_set_state, 933 | }; 934 | 935 | static int __init bbr_register(void) 936 | { 937 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 938 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 939 | } 940 | 941 | static void __exit bbr_unregister(void) 942 | { 943 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 944 | } 945 | 946 | module_init(bbr_register); 947 | module_exit(bbr_unregister); 948 | 949 | MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); 950 | MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); 951 | MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); 952 | MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); 953 | MODULE_LICENSE("Dual BSD/GPL"); 954 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 955 | -------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.14/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable.
16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * otherwise TCP stack falls back to an internal pacing using one high 57 | * resolution timer per TCP socket and may use more resources. 58 | */ 59 | #include <linux/module.h> 60 | #include <net/tcp.h> 61 | #include <linux/inet_diag.h> 62 | #include <linux/inet.h> 63 | #include <linux/random.h> 64 | #include <linux/win_minmax.h> 65 | 66 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 67 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 68 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 69 | * Since the minimum window is >=4 packets, the lower bound isn't 70 | * an issue. The upper bound isn't an issue with existing technologies. 71 | */ 72 | #define BW_SCALE 24 73 | #define BW_UNIT (1 << BW_SCALE) 74 | 75 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g.
gains) */ 76 | #define BBR_UNIT (1 << BBR_SCALE) 77 | 78 | /* BBR has the following modes for deciding how fast to send: */ 79 | enum bbr_mode { 80 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 | BBR_DRAIN, /* drain any queue created during startup */ 82 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 84 | }; 85 | 86 | /* BBR congestion control block */ 87 | struct bbr { 88 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 89 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 90 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 91 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 92 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 93 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 94 | u64 cycle_mstamp; /* time of this cycle phase start */ 95 | u32 mode:3, /* current bbr_mode in state machine */ 96 | prev_ca_state:3, /* CA state on previous ACK */ 97 | packet_conservation:1, /* use packet conservation? */ 98 | restore_cwnd:1, /* decided to revert cwnd to old value */ 99 | round_start:1, /* start of packet-timed tx->ack round? */ 100 | tso_segs_goal:7, /* segments we want in each skb we send */ 101 | idle_restart:1, /* restarting after idle? */ 102 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 103 | unused:5, 104 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 105 | lt_rtt_cnt:7, /* round trips in long-term interval */ 106 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 107 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 108 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 109 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 110 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 111 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 112 | cwnd_gain:10, /* current gain for setting cwnd */ 113 | full_bw_cnt:3, /* number of rounds without large bw gains */ 114 | cycle_idx:3, /* current index in pacing_gain cycle array */ 115 | has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ 116 | unused_b:5; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... */ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? 
*/ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 216 | static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 217 | { 218 | u64 rate = bw; 219 | 220 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 221 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 222 | return rate; 223 | } 224 | 225 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 226 | static void bbr_init_pacing_rate_from_rtt(struct sock *sk) 227 | { 228 | struct tcp_sock *tp = tcp_sk(sk); 229 | struct bbr *bbr = inet_csk_ca(sk); 230 | u64 bw; 231 | u32 rtt_us; 232 | 233 | if (tp->srtt_us) { /* any RTT sample yet? */ 234 | rtt_us = max(tp->srtt_us >> 3, 1U); 235 | bbr->has_seen_rtt = 1; 236 | } else { /* no RTT sample yet */ 237 | rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ 238 | } 239 | bw = (u64)tp->snd_cwnd * BW_UNIT; 240 | do_div(bw, rtt_us); 241 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); 242 | } 243 | 244 | /* Pace using current bw estimate and a gain factor. In order to help drive the 245 | * network toward lower queues while maintaining high utilization and low 246 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 247 | * estimated bandwidth. This is an important aspect of the design. In this 248 | * implementation this slightly lower pacing rate is achieved implicitly by not 249 | * including link-layer headers in the packet size used for the pacing rate. 250 | */ 251 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 252 | { 253 | struct tcp_sock *tp = tcp_sk(sk); 254 | struct bbr *bbr = inet_csk_ca(sk); 255 | u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 256 | 257 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 258 | bbr_init_pacing_rate_from_rtt(sk); 259 | if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) 260 | sk->sk_pacing_rate = rate; 261 | } 262 | 263 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 264 | static u32 bbr_tso_segs_goal(struct sock *sk) 265 | { 266 | struct bbr *bbr = inet_csk_ca(sk); 267 | 268 | return bbr->tso_segs_goal; 269 | } 270 | 271 | static void bbr_set_tso_segs_goal(struct sock *sk) 272 | { 273 | struct tcp_sock *tp = tcp_sk(sk); 274 | struct bbr *bbr = inet_csk_ca(sk); 275 | u32 min_segs; 276 | 277 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 278 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 279 | 0x7FU); 280 | } 281 | 282 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 283 | static void bbr_save_cwnd(struct sock *sk) 284 | { 285 | struct tcp_sock *tp = tcp_sk(sk); 286 | struct bbr *bbr = inet_csk_ca(sk); 287 | 288 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 289 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 290 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 291 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 292 | } 293 | 294 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 295 | { 296 | struct tcp_sock *tp = tcp_sk(sk); 297 | struct bbr *bbr = inet_csk_ca(sk); 298 | 299 | if (event == CA_EVENT_TX_START && tp->app_limited) { 300 | bbr->idle_restart = 1; 301 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 302 | * need more speed (we're restarting from idle and app-limited). 303 | */ 304 | if (bbr->mode == BBR_PROBE_BW) 305 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 306 | } 307 | } 308 | 309 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 310 | * estimated bottleneck bandwidth: 311 | * 312 | * cwnd = bw * min_rtt * gain = BDP * gain 313 | * 314 | * The key factor, gain, controls the amount of queue. While a small gain 315 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 316 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 317 | * noise may cause BBR to under-estimate the rate. 318 | * 319 | * To achieve full performance in high-speed paths, we budget enough cwnd to 320 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 321 | * - one skb in sending host Qdisc, 322 | * - one skb in sending host TSO/GSO engine 323 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 324 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 325 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 326 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 327 | * full even with ACK-every-other-packet delayed ACKs. 328 | */ 329 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 330 | { 331 | struct bbr *bbr = inet_csk_ca(sk); 332 | u32 cwnd; 333 | u64 w; 334 | 335 | /* If we've never had a valid RTT sample, cap cwnd at the initial 336 | * default. This should only happen when the connection is not using TCP 337 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 338 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 339 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 340 | */ 341 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 342 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 343 | 344 | w = (u64)bw * bbr->min_rtt_us; 345 | 346 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 347 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 348 | 349 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 350 | cwnd += 3 * bbr->tso_segs_goal; 351 | 352 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 353 | cwnd = (cwnd + 1) & ~1U; 354 | 355 | return cwnd; 356 | } 357 | 358 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 359 | * follow the packet conservation principle: send P packets per P packets acked. 360 | * After that, we slow-start and send at most 2*P packets per P packets acked. 361 | * After recovery finishes, or upon undo, we restore the cwnd we had when 362 | * recovery started (capped by the target cwnd based on estimated BDP). 363 | * 364 | * TODO(ycheng/ncardwell): implement a rate-based approach. 365 | */ 366 | static bool bbr_set_cwnd_to_recover_or_restore( 367 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 368 | { 369 | struct tcp_sock *tp = tcp_sk(sk); 370 | struct bbr *bbr = inet_csk_ca(sk); 371 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 372 | u32 cwnd = tp->snd_cwnd; 373 | 374 | /* An ACK for P pkts should release at most 2*P packets. We do this 375 | * in two steps. First, here we deduct the number of lost packets. 376 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 377 | */ 378 | if (rs->losses > 0) 379 | cwnd = max_t(s32, cwnd - rs->losses, 1); 380 | 381 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 382 | /* Starting 1st round of Recovery, so do packet conservation. */ 383 | bbr->packet_conservation = 1; 384 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 385 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 386 | cwnd = tcp_packets_in_flight(tp) + acked; 387 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 388 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 389 | bbr->restore_cwnd = 1; 390 | bbr->packet_conservation = 0; 391 | } 392 | bbr->prev_ca_state = state; 393 | 394 | if (bbr->restore_cwnd) { 395 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 396 | cwnd = max(cwnd, bbr->prior_cwnd); 397 | bbr->restore_cwnd = 0; 398 | } 399 | 400 | if (bbr->packet_conservation) { 401 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 402 | return true; /* yes, using packet conservation */ 403 | } 404 | *new_cwnd = cwnd; 405 | return false; 406 | } 407 | 408 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 409 | * has drawn us down below target), or snap down to target if we're above it. 410 | */ 411 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 412 | u32 acked, u32 bw, int gain) 413 | { 414 | struct tcp_sock *tp = tcp_sk(sk); 415 | struct bbr *bbr = inet_csk_ca(sk); 416 | u32 cwnd = 0, target_cwnd = 0; 417 | 418 | if (!acked) 419 | return; 420 | 421 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 422 | goto done; 423 | 424 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 425 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 426 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 427 | cwnd = min(cwnd + acked, target_cwnd); 428 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 429 | cwnd = cwnd + acked; 430 | cwnd = max(cwnd, bbr_cwnd_min_target); 431 | 432 | done: 433 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 434 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 435 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 436 | } 437 | 438 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 439 | static bool bbr_is_next_cycle_phase(struct sock *sk, 440 | const struct rate_sample *rs) 441 | { 442 | struct tcp_sock *tp = tcp_sk(sk); 443 | struct bbr *bbr = inet_csk_ca(sk); 444 | bool is_full_length = 445 | tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > 446 | bbr->min_rtt_us; 447 | u32 inflight, bw; 448 | 449 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 450 | * use the pipe without increasing the queue. 451 | */ 452 | if (bbr->pacing_gain == BBR_UNIT) 453 | return is_full_length; /* just use wall clock time */ 454 | 455 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 456 | bw = bbr_max_bw(sk); 457 | 458 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 459 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 460 | * small (e.g. on a LAN). We do not persist if packets are lost, since 461 | * a path with small buffers may not hold that much. 462 | */ 463 | if (bbr->pacing_gain > BBR_UNIT) 464 | return is_full_length && 465 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 466 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 467 | 468 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 469 | * probing didn't find more bw. If inflight falls to match BDP then we 470 | * estimate queue is drained; persisting would underutilize the pipe. 471 | */ 472 | return is_full_length || 473 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 474 | } 475 | 476 | static void bbr_advance_cycle_phase(struct sock *sk) 477 | { 478 | struct tcp_sock *tp = tcp_sk(sk); 479 | struct bbr *bbr = inet_csk_ca(sk); 480 | 481 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 482 | bbr->cycle_mstamp = tp->delivered_mstamp; 483 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 484 | } 485 | 486 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 487 | static void bbr_update_cycle_phase(struct sock *sk, 488 | const struct rate_sample *rs) 489 | { 490 | struct bbr *bbr = inet_csk_ca(sk); 491 | 492 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 493 | bbr_is_next_cycle_phase(sk, rs)) 494 | bbr_advance_cycle_phase(sk); 495 | } 496 | 497 | static void bbr_reset_startup_mode(struct sock *sk) 498 | { 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->mode = BBR_STARTUP; 502 | bbr->pacing_gain = bbr_high_gain; 503 | bbr->cwnd_gain = bbr_high_gain; 504 | } 505 | 506 | static void bbr_reset_probe_bw_mode(struct sock *sk) 507 | { 508 | struct bbr *bbr = inet_csk_ca(sk); 509 | 510 | bbr->mode = BBR_PROBE_BW; 511 | bbr->pacing_gain = BBR_UNIT; 512 | bbr->cwnd_gain = bbr_cwnd_gain; 513 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 514 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 515 | } 516 | 517 | static void bbr_reset_mode(struct sock *sk) 518 | { 519 | if (!bbr_full_bw_reached(sk)) 520 | bbr_reset_startup_mode(sk); 521 | else 522 | bbr_reset_probe_bw_mode(sk); 523 | } 524 | 525 | /* Start a new long-term sampling interval. */ 526 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 527 | { 528 | struct tcp_sock *tp = tcp_sk(sk); 529 | struct bbr *bbr = inet_csk_ca(sk); 530 | 531 | bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); 532 | bbr->lt_last_delivered = tp->delivered; 533 | bbr->lt_last_lost = tp->lost; 534 | bbr->lt_rtt_cnt = 0; 535 | } 536 | 537 | /* Completely reset long-term bandwidth sampling. 
*/ 538 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 539 | { 540 | struct bbr *bbr = inet_csk_ca(sk); 541 | 542 | bbr->lt_bw = 0; 543 | bbr->lt_use_bw = 0; 544 | bbr->lt_is_sampling = false; 545 | bbr_reset_lt_bw_sampling_interval(sk); 546 | } 547 | 548 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 549 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 550 | { 551 | struct bbr *bbr = inet_csk_ca(sk); 552 | u32 diff; 553 | 554 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 555 | /* Is new bw close to the lt_bw from the previous interval? */ 556 | diff = abs(bw - bbr->lt_bw); 557 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 558 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 559 | bbr_lt_bw_diff)) { 560 | /* All criteria are met; estimate we're policed. */ 561 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 562 | bbr->lt_use_bw = 1; 563 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 564 | bbr->lt_rtt_cnt = 0; 565 | return; 566 | } 567 | } 568 | bbr->lt_bw = bw; 569 | bbr_reset_lt_bw_sampling_interval(sk); 570 | } 571 | 572 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 573 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 574 | * explicitly models their policed rate, to reduce unnecessary losses. We 575 | * estimate that we're policed if we see 2 consecutive sampling intervals with 576 | * consistent throughput and high packet loss. If we think we're being policed, 577 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 578 | */ 579 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 580 | { 581 | struct tcp_sock *tp = tcp_sk(sk); 582 | struct bbr *bbr = inet_csk_ca(sk); 583 | u32 lost, delivered; 584 | u64 bw; 585 | u32 t; 586 | 587 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 588 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 589 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 590 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 591 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 592 | } 593 | return; 594 | } 595 | 596 | /* Wait for the first loss before sampling, to let the policer exhaust 597 | * its tokens and estimate the steady-state rate allowed by the policer. 598 | * Starting samples earlier includes bursts that over-estimate the bw. 599 | */ 600 | if (!bbr->lt_is_sampling) { 601 | if (!rs->losses) 602 | return; 603 | bbr_reset_lt_bw_sampling_interval(sk); 604 | bbr->lt_is_sampling = true; 605 | } 606 | 607 | /* To avoid underestimates, reset sampling if we run out of data. */ 608 | if (rs->is_app_limited) { 609 | bbr_reset_lt_bw_sampling(sk); 610 | return; 611 | } 612 | 613 | if (bbr->round_start) 614 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 615 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 616 | return; /* sampling interval needs to be longer */ 617 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 618 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 619 | return; 620 | } 621 | 622 | /* End sampling interval when a packet is lost, so we estimate the 623 | * policer tokens were exhausted. Stopping the sampling before the 624 | * tokens are exhausted under-estimates the policed rate. 625 | */ 626 | if (!rs->losses) 627 | return; 628 | 629 | /* Calculate packets lost and delivered in sampling interval. 
*/ 630 | lost = tp->lost - bbr->lt_last_lost; 631 | delivered = tp->delivered - bbr->lt_last_delivered; 632 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 633 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 634 | return; 635 | 636 | /* Find average delivery rate in this sampling interval. */ 637 | t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; 638 | if ((s32)t < 1) 639 | return; /* interval is less than one ms, so wait */ 640 | /* Check if can multiply without overflow */ 641 | if (t >= ~0U / USEC_PER_MSEC) { 642 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 643 | return; 644 | } 645 | t *= USEC_PER_MSEC; 646 | bw = (u64)delivered * BW_UNIT; 647 | do_div(bw, t); 648 | bbr_lt_bw_interval_done(sk, bw); 649 | } 650 | 651 | /* Estimate the bandwidth based on how fast packets are delivered */ 652 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 653 | { 654 | struct tcp_sock *tp = tcp_sk(sk); 655 | struct bbr *bbr = inet_csk_ca(sk); 656 | u64 bw; 657 | 658 | bbr->round_start = 0; 659 | if (rs->delivered < 0 || rs->interval_us <= 0) 660 | return; /* Not a valid observation */ 661 | 662 | /* See if we've reached the next RTT */ 663 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 664 | bbr->next_rtt_delivered = tp->delivered; 665 | bbr->rtt_cnt++; 666 | bbr->round_start = 1; 667 | bbr->packet_conservation = 0; 668 | } 669 | 670 | bbr_lt_bw_sampling(sk, rs); 671 | 672 | /* Divide delivered by the interval to find a (lower bound) bottleneck 673 | * bandwidth sample. Delivered is in packets and interval_us in uS and 674 | * ratio will be <<1 for most connections. So delivered is first scaled. 675 | */ 676 | bw = (u64)rs->delivered * BW_UNIT; 677 | do_div(bw, rs->interval_us); 678 | 679 | /* If this sample is application-limited, it is likely to have a very 680 | * low delivered count that represents application behavior rather than 681 | * the available network rate. Such a sample could drag down estimated 682 | * bw, causing needless slow-down. Thus, to continue to send at the 683 | * last measured network rate, we filter out app-limited samples unless 684 | * they describe the path bw at least as well as our bw model. 685 | * 686 | * So the goal during app-limited phase is to proceed with the best 687 | * network rate no matter how long. We automatically leave this 688 | * phase when app writes faster than the network can deliver :) 689 | */ 690 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 691 | /* Incorporate new sample into our max bw filter. */ 692 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 693 | } 694 | } 695 | 696 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 697 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 698 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 699 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 700 | * higher rwin, 3: we get higher delivery rate samples. Or transient 701 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 702 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
703 | */ 704 | static void bbr_check_full_bw_reached(struct sock *sk, 705 | const struct rate_sample *rs) 706 | { 707 | struct bbr *bbr = inet_csk_ca(sk); 708 | u32 bw_thresh; 709 | 710 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 711 | return; 712 | 713 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 714 | if (bbr_max_bw(sk) >= bw_thresh) { 715 | bbr->full_bw = bbr_max_bw(sk); 716 | bbr->full_bw_cnt = 0; 717 | return; 718 | } 719 | ++bbr->full_bw_cnt; 720 | } 721 | 722 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 723 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 724 | { 725 | struct bbr *bbr = inet_csk_ca(sk); 726 | 727 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 728 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 729 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 730 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 731 | } /* fall through to check if in-flight is already small: */ 732 | if (bbr->mode == BBR_DRAIN && 733 | tcp_packets_in_flight(tcp_sk(sk)) <= 734 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 735 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 736 | } 737 | 738 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 739 | * periodically drain the bottleneck queue, to converge to measure the true 740 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 741 | * small (reducing queuing delay and packet loss) and achieve fairness among 742 | * BBR flows. 743 | * 744 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 745 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 746 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 747 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 748 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 749 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 750 | * 751 | * Note that flows need only pay 2% if they are busy sending over the last 10 752 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 753 | * natural silences or low-rate periods within 10 seconds where the rate is low 754 | * enough for long enough to drain its queue in the bottleneck. We pick up 755 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 756 | */ 757 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 758 | { 759 | struct tcp_sock *tp = tcp_sk(sk); 760 | struct bbr *bbr = inet_csk_ca(sk); 761 | bool filter_expired; 762 | 763 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 764 | filter_expired = after(tcp_jiffies32, 765 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 766 | if (rs->rtt_us >= 0 && 767 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 768 | bbr->min_rtt_us = rs->rtt_us; 769 | bbr->min_rtt_stamp = tcp_jiffies32; 770 | } 771 | 772 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 773 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 774 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 775 | bbr->pacing_gain = BBR_UNIT; 776 | bbr->cwnd_gain = BBR_UNIT; 777 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 778 | bbr->probe_rtt_done_stamp = 0; 779 | } 780 | 781 | if (bbr->mode == BBR_PROBE_RTT) { 782 | /* Ignore low rate samples during this mode. 
*/ 783 | tp->app_limited = 784 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 785 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 786 | if (!bbr->probe_rtt_done_stamp && 787 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 788 | bbr->probe_rtt_done_stamp = tcp_jiffies32 + 789 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 790 | bbr->probe_rtt_round_done = 0; 791 | bbr->next_rtt_delivered = tp->delivered; 792 | } else if (bbr->probe_rtt_done_stamp) { 793 | if (bbr->round_start) 794 | bbr->probe_rtt_round_done = 1; 795 | if (bbr->probe_rtt_round_done && 796 | after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 797 | bbr->min_rtt_stamp = tcp_jiffies32; 798 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 799 | bbr_reset_mode(sk); 800 | } 801 | } 802 | } 803 | bbr->idle_restart = 0; 804 | } 805 | 806 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 807 | { 808 | bbr_update_bw(sk, rs); 809 | bbr_update_cycle_phase(sk, rs); 810 | bbr_check_full_bw_reached(sk, rs); 811 | bbr_check_drain(sk, rs); 812 | bbr_update_min_rtt(sk, rs); 813 | } 814 | 815 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 816 | { 817 | struct bbr *bbr = inet_csk_ca(sk); 818 | u32 bw; 819 | 820 | bbr_update_model(sk, rs); 821 | 822 | bw = bbr_bw(sk); 823 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 824 | bbr_set_tso_segs_goal(sk); 825 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 826 | } 827 | 828 | static void bbr_init(struct sock *sk) 829 | { 830 | struct tcp_sock *tp = tcp_sk(sk); 831 | struct bbr *bbr = inet_csk_ca(sk); 832 | 833 | bbr->prior_cwnd = 0; 834 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 835 | bbr->rtt_cnt = 0; 836 | bbr->next_rtt_delivered = 0; 837 | bbr->prev_ca_state = TCP_CA_Open; 838 | bbr->packet_conservation = 0; 839 | 840 | bbr->probe_rtt_done_stamp = 0; 841 | bbr->probe_rtt_round_done = 0; 842 | bbr->min_rtt_us = tcp_min_rtt(tp); 843 | bbr->min_rtt_stamp = tcp_jiffies32; 844 | 845 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 846 | 847 | bbr->has_seen_rtt = 0; 848 | bbr_init_pacing_rate_from_rtt(sk); 849 | 850 | bbr->restore_cwnd = 0; 851 | bbr->round_start = 0; 852 | bbr->idle_restart = 0; 853 | bbr->full_bw = 0; 854 | bbr->full_bw_cnt = 0; 855 | bbr->cycle_mstamp = 0; 856 | bbr->cycle_idx = 0; 857 | bbr_reset_lt_bw_sampling(sk); 858 | bbr_reset_startup_mode(sk); 859 | 860 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 861 | } 862 | 863 | static u32 bbr_sndbuf_expand(struct sock *sk) 864 | { 865 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 866 | return 3; 867 | } 868 | 869 | /* In theory BBR does not need to undo the cwnd since it does not 870 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 871 | */ 872 | static u32 bbr_undo_cwnd(struct sock *sk) 873 | { 874 | return tcp_sk(sk)->snd_cwnd; 875 | } 876 | 877 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ 878 | static u32 bbr_ssthresh(struct sock *sk) 879 | { 880 | bbr_save_cwnd(sk); 881 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 882 | } 883 | 884 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 885 | union tcp_cc_info *info) 886 | { 887 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 888 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 889 | struct tcp_sock *tp = tcp_sk(sk); 890 | struct bbr *bbr = inet_csk_ca(sk); 891 | u64 bw = bbr_bw(sk); 892 | 893 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 894 | memset(&info->bbr, 0, sizeof(info->bbr)); 895 | info->bbr.bbr_bw_lo = (u32)bw; 896 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 897 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 898 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 899 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 900 | *attr = INET_DIAG_BBRINFO; 901 | return sizeof(info->bbr); 902 | } 903 | return 0; 904 | } 905 | 906 | static void bbr_set_state(struct sock *sk, u8 new_state) 907 | { 908 | struct bbr *bbr = inet_csk_ca(sk); 909 | 910 | if (new_state == TCP_CA_Loss) { 911 | struct rate_sample rs = { .losses = 1 }; 912 | 913 | bbr->prev_ca_state = TCP_CA_Loss; 914 | bbr->full_bw = 0; 915 | bbr->round_start = 1; /* treat RTO like end of a round */ 916 | bbr_lt_bw_sampling(sk, &rs); 917 | } 918 | } 919 | 920 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 921 | .flags = TCP_CONG_NON_RESTRICTED, 922 | .name = "bbr", 923 | .owner = THIS_MODULE, 924 | .init = bbr_init, 925 | .cong_control = bbr_main, 926 | .sndbuf_expand = bbr_sndbuf_expand, 927 | .undo_cwnd = bbr_undo_cwnd, 928 | .cwnd_event = bbr_cwnd_event, 929 | .ssthresh = bbr_ssthresh, 930 | .tso_segs_goal = bbr_tso_segs_goal, 931 | .get_info = bbr_get_info, 932 | .set_state = bbr_set_state, 933 | }; 934 | 935 | static int __init bbr_register(void) 936 | { 937 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 938 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 939 | } 940 | 941 | static void __exit bbr_unregister(void) 942 | { 943 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 944 | } 945 | 946 | module_init(bbr_register); 947 | module_exit(bbr_unregister); 948 | 949 | MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); 950 | MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); 951 | MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); 952 | MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); 953 | MODULE_LICENSE("Dual BSD/GPL"); 954 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 955 | -------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.15/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable.
16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * otherwise TCP stack falls back to an internal pacing using one high 57 | * resolution timer per TCP socket and may use more resources. 58 | */ 59 | #include <linux/module.h> 60 | #include <net/tcp.h> 61 | #include <linux/inet_diag.h> 62 | #include <linux/inet.h> 63 | #include <linux/random.h> 64 | #include <linux/win_minmax.h> 65 | 66 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 67 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 68 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 69 | * Since the minimum window is >=4 packets, the lower bound isn't 70 | * an issue. The upper bound isn't an issue with existing technologies. 71 | */ 72 | #define BW_SCALE 24 73 | #define BW_UNIT (1 << BW_SCALE) 74 | 75 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g.
gains) */ 76 | #define BBR_UNIT (1 << BBR_SCALE) 77 | 78 | /* BBR has the following modes for deciding how fast to send: */ 79 | enum bbr_mode { 80 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 | BBR_DRAIN, /* drain any queue created during startup */ 82 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 84 | }; 85 | 86 | /* BBR congestion control block */ 87 | struct bbr { 88 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 89 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 90 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 91 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 92 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 93 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 94 | u64 cycle_mstamp; /* time of this cycle phase start */ 95 | u32 mode:3, /* current bbr_mode in state machine */ 96 | prev_ca_state:3, /* CA state on previous ACK */ 97 | packet_conservation:1, /* use packet conservation? */ 98 | restore_cwnd:1, /* decided to revert cwnd to old value */ 99 | round_start:1, /* start of packet-timed tx->ack round? */ 100 | tso_segs_goal:7, /* segments we want in each skb we send */ 101 | idle_restart:1, /* restarting after idle? */ 102 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 103 | unused:5, 104 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 105 | lt_rtt_cnt:7, /* round trips in long-term interval */ 106 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 107 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 108 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 109 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 110 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 111 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 112 | cwnd_gain:10, /* current gain for setting cwnd */ 113 | full_bw_reached:1, /* reached full bw in Startup? */ 114 | full_bw_cnt:2, /* number of rounds without large bw gains */ 115 | cycle_idx:3, /* current index in pacing_gain cycle array */ 116 | has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ 117 | unused_b:5; 118 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 119 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 120 | }; 121 | 122 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 123 | 124 | /* Window length of bw filter (in rounds): */ 125 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 126 | /* Window length of min_rtt filter (in sec): */ 127 | static const u32 bbr_min_rtt_win_sec = 10; 128 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 129 | static const u32 bbr_probe_rtt_mode_ms = 200; 130 | /* Skip TSO below the following bandwidth (bits/sec): */ 131 | static const int bbr_min_tso_rate = 1200000; 132 | 133 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 134 | * that will allow a smoothly increasing pacing rate that will double each RTT 135 | * and send the same number of packets per RTT that an un-paced, slow-starting 136 | * Reno or CUBIC flow would: 137 | */ 138 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 139 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 140 | * the queue created in BBR_STARTUP in a single round: 141 | */ 142 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 143 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 144 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 145 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 146 | static const int bbr_pacing_gain[] = { 147 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 148 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 150 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 151 | }; 152 | /* Randomize the starting gain cycling phase over N phases: */ 153 | static const u32 bbr_cycle_rand = 7; 154 | 155 | /* Try to keep at least this many packets in flight, if things go smoothly. For 156 | * smooth functioning, a sliding window protocol ACKing every other packet 157 | * needs at least 4 packets in flight: 158 | */ 159 | static const u32 bbr_cwnd_min_target = 4; 160 | 161 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 162 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 163 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 164 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 165 | static const u32 bbr_full_bw_cnt = 3; 166 | 167 | /* "long-term" ("LT") bandwidth estimator parameters... */ 168 | /* The minimum number of rounds in an LT bw sampling interval: */ 169 | static const u32 bbr_lt_intvl_min_rtts = 4; 170 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 171 | static const u32 bbr_lt_loss_thresh = 50; 172 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 173 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 174 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 175 | static const u32 bbr_lt_bw_diff = 4000 / 8; 176 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 177 | static const u32 bbr_lt_bw_max_rtts = 48; 178 | 179 | /* Do we estimate that STARTUP filled the pipe? 
*/ 180 | static bool bbr_full_bw_reached(const struct sock *sk) 181 | { 182 | const struct bbr *bbr = inet_csk_ca(sk); 183 | 184 | return bbr->full_bw_reached; 185 | } 186 | 187 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 188 | static u32 bbr_max_bw(const struct sock *sk) 189 | { 190 | struct bbr *bbr = inet_csk_ca(sk); 191 | 192 | return minmax_get(&bbr->bw); 193 | } 194 | 195 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 196 | static u32 bbr_bw(const struct sock *sk) 197 | { 198 | struct bbr *bbr = inet_csk_ca(sk); 199 | 200 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 201 | } 202 | 203 | /* Return rate in bytes per second, optionally with a gain. 204 | * The order here is chosen carefully to avoid overflow of u64. This should 205 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 206 | */ 207 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 208 | { 209 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 210 | rate *= gain; 211 | rate >>= BBR_SCALE; 212 | rate *= USEC_PER_SEC; 213 | return rate >> BW_SCALE; 214 | } 215 | 216 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 217 | static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 218 | { 219 | u64 rate = bw; 220 | 221 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 222 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 223 | return rate; 224 | } 225 | 226 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 227 | static void bbr_init_pacing_rate_from_rtt(struct sock *sk) 228 | { 229 | struct tcp_sock *tp = tcp_sk(sk); 230 | struct bbr *bbr = inet_csk_ca(sk); 231 | u64 bw; 232 | u32 rtt_us; 233 | 234 | if (tp->srtt_us) { /* any RTT sample yet? */ 235 | rtt_us = max(tp->srtt_us >> 3, 1U); 236 | bbr->has_seen_rtt = 1; 237 | } else { /* no RTT sample yet */ 238 | rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ 239 | } 240 | bw = (u64)tp->snd_cwnd * BW_UNIT; 241 | do_div(bw, rtt_us); 242 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); 243 | } 244 | 245 | /* Pace using current bw estimate and a gain factor. In order to help drive the 246 | * network toward lower queues while maintaining high utilization and low 247 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 248 | * estimated bandwidth. This is an important aspect of the design. In this 249 | * implementation this slightly lower pacing rate is achieved implicitly by not 250 | * including link-layer headers in the packet size used for the pacing rate. 251 | */ 252 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 253 | { 254 | struct tcp_sock *tp = tcp_sk(sk); 255 | struct bbr *bbr = inet_csk_ca(sk); 256 | u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 257 | 258 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 259 | bbr_init_pacing_rate_from_rtt(sk); 260 | if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) 261 | sk->sk_pacing_rate = rate; 262 | } 263 | 264 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 265 | static u32 bbr_tso_segs_goal(struct sock *sk) 266 | { 267 | struct bbr *bbr = inet_csk_ca(sk); 268 | 269 | return bbr->tso_segs_goal; 270 | } 271 | 272 | static void bbr_set_tso_segs_goal(struct sock *sk) 273 | { 274 | struct tcp_sock *tp = tcp_sk(sk); 275 | struct bbr *bbr = inet_csk_ca(sk); 276 | u32 min_segs; 277 | 278 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 279 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 280 | 0x7FU); 281 | } 282 | 283 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 284 | static void bbr_save_cwnd(struct sock *sk) 285 | { 286 | struct tcp_sock *tp = tcp_sk(sk); 287 | struct bbr *bbr = inet_csk_ca(sk); 288 | 289 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 290 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 291 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 292 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 293 | } 294 | 295 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 296 | { 297 | struct tcp_sock *tp = tcp_sk(sk); 298 | struct bbr *bbr = inet_csk_ca(sk); 299 | 300 | if (event == CA_EVENT_TX_START && tp->app_limited) { 301 | bbr->idle_restart = 1; 302 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 303 | * need more speed (we're restarting from idle and app-limited). 304 | */ 305 | if (bbr->mode == BBR_PROBE_BW) 306 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 307 | } 308 | } 309 | 310 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 311 | * estimated bottleneck bandwidth: 312 | * 313 | * cwnd = bw * min_rtt * gain = BDP * gain 314 | * 315 | * The key factor, gain, controls the amount of queue. While a small gain 316 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 317 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 318 | * noise may cause BBR to under-estimate the rate. 319 | * 320 | * To achieve full performance in high-speed paths, we budget enough cwnd to 321 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 322 | * - one skb in sending host Qdisc, 323 | * - one skb in sending host TSO/GSO engine 324 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 325 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 326 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 327 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 328 | * full even with ACK-every-other-packet delayed ACKs. 329 | */ 330 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 331 | { 332 | struct bbr *bbr = inet_csk_ca(sk); 333 | u32 cwnd; 334 | u64 w; 335 | 336 | /* If we've never had a valid RTT sample, cap cwnd at the initial 337 | * default. This should only happen when the connection is not using TCP 338 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 339 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 340 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 341 | */ 342 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 343 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 344 | 345 | w = (u64)bw * bbr->min_rtt_us; 346 | 347 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 348 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 349 | 350 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 351 | cwnd += 3 * bbr->tso_segs_goal; 352 | 353 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 354 | cwnd = (cwnd + 1) & ~1U; 355 | 356 | return cwnd; 357 | } 358 | 359 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 360 | * follow the packet conservation principle: send P packets per P packets acked. 361 | * After that, we slow-start and send at most 2*P packets per P packets acked. 362 | * After recovery finishes, or upon undo, we restore the cwnd we had when 363 | * recovery started (capped by the target cwnd based on estimated BDP). 364 | * 365 | * TODO(ycheng/ncardwell): implement a rate-based approach. 366 | */ 367 | static bool bbr_set_cwnd_to_recover_or_restore( 368 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 369 | { 370 | struct tcp_sock *tp = tcp_sk(sk); 371 | struct bbr *bbr = inet_csk_ca(sk); 372 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 373 | u32 cwnd = tp->snd_cwnd; 374 | 375 | /* An ACK for P pkts should release at most 2*P packets. We do this 376 | * in two steps. First, here we deduct the number of lost packets. 377 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 378 | */ 379 | if (rs->losses > 0) 380 | cwnd = max_t(s32, cwnd - rs->losses, 1); 381 | 382 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 383 | /* Starting 1st round of Recovery, so do packet conservation. */ 384 | bbr->packet_conservation = 1; 385 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 386 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 387 | cwnd = tcp_packets_in_flight(tp) + acked; 388 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 389 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 390 | bbr->restore_cwnd = 1; 391 | bbr->packet_conservation = 0; 392 | } 393 | bbr->prev_ca_state = state; 394 | 395 | if (bbr->restore_cwnd) { 396 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 397 | cwnd = max(cwnd, bbr->prior_cwnd); 398 | bbr->restore_cwnd = 0; 399 | } 400 | 401 | if (bbr->packet_conservation) { 402 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 403 | return true; /* yes, using packet conservation */ 404 | } 405 | *new_cwnd = cwnd; 406 | return false; 407 | } 408 | 409 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 410 | * has drawn us down below target), or snap down to target if we're above it. 411 | */ 412 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 413 | u32 acked, u32 bw, int gain) 414 | { 415 | struct tcp_sock *tp = tcp_sk(sk); 416 | struct bbr *bbr = inet_csk_ca(sk); 417 | u32 cwnd = 0, target_cwnd = 0; 418 | 419 | if (!acked) 420 | return; 421 | 422 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 423 | goto done; 424 | 425 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 426 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 427 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 428 | cwnd = min(cwnd + acked, target_cwnd); 429 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 430 | cwnd = cwnd + acked; 431 | cwnd = max(cwnd, bbr_cwnd_min_target); 432 | 433 | done: 434 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 435 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 436 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 437 | } 438 | 439 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 440 | static bool bbr_is_next_cycle_phase(struct sock *sk, 441 | const struct rate_sample *rs) 442 | { 443 | struct tcp_sock *tp = tcp_sk(sk); 444 | struct bbr *bbr = inet_csk_ca(sk); 445 | bool is_full_length = 446 | tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > 447 | bbr->min_rtt_us; 448 | u32 inflight, bw; 449 | 450 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 451 | * use the pipe without increasing the queue. 452 | */ 453 | if (bbr->pacing_gain == BBR_UNIT) 454 | return is_full_length; /* just use wall clock time */ 455 | 456 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 457 | bw = bbr_max_bw(sk); 458 | 459 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 460 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 461 | * small (e.g. on a LAN). We do not persist if packets are lost, since 462 | * a path with small buffers may not hold that much. 463 | */ 464 | if (bbr->pacing_gain > BBR_UNIT) 465 | return is_full_length && 466 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 467 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 468 | 469 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 470 | * probing didn't find more bw. If inflight falls to match BDP then we 471 | * estimate queue is drained; persisting would underutilize the pipe. 472 | */ 473 | return is_full_length || 474 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 475 | } 476 | 477 | static void bbr_advance_cycle_phase(struct sock *sk) 478 | { 479 | struct tcp_sock *tp = tcp_sk(sk); 480 | struct bbr *bbr = inet_csk_ca(sk); 481 | 482 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 483 | bbr->cycle_mstamp = tp->delivered_mstamp; 484 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 485 | } 486 | 487 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 488 | static void bbr_update_cycle_phase(struct sock *sk, 489 | const struct rate_sample *rs) 490 | { 491 | struct bbr *bbr = inet_csk_ca(sk); 492 | 493 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 494 | bbr_is_next_cycle_phase(sk, rs)) 495 | bbr_advance_cycle_phase(sk); 496 | } 497 | 498 | static void bbr_reset_startup_mode(struct sock *sk) 499 | { 500 | struct bbr *bbr = inet_csk_ca(sk); 501 | 502 | bbr->mode = BBR_STARTUP; 503 | bbr->pacing_gain = bbr_high_gain; 504 | bbr->cwnd_gain = bbr_high_gain; 505 | } 506 | 507 | static void bbr_reset_probe_bw_mode(struct sock *sk) 508 | { 509 | struct bbr *bbr = inet_csk_ca(sk); 510 | 511 | bbr->mode = BBR_PROBE_BW; 512 | bbr->pacing_gain = BBR_UNIT; 513 | bbr->cwnd_gain = bbr_cwnd_gain; 514 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 515 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 516 | } 517 | 518 | static void bbr_reset_mode(struct sock *sk) 519 | { 520 | if (!bbr_full_bw_reached(sk)) 521 | bbr_reset_startup_mode(sk); 522 | else 523 | bbr_reset_probe_bw_mode(sk); 524 | } 525 | 526 | /* Start a new long-term sampling interval. */ 527 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 528 | { 529 | struct tcp_sock *tp = tcp_sk(sk); 530 | struct bbr *bbr = inet_csk_ca(sk); 531 | 532 | bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); 533 | bbr->lt_last_delivered = tp->delivered; 534 | bbr->lt_last_lost = tp->lost; 535 | bbr->lt_rtt_cnt = 0; 536 | } 537 | 538 | /* Completely reset long-term bandwidth sampling. 
*/ 539 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 540 | { 541 | struct bbr *bbr = inet_csk_ca(sk); 542 | 543 | bbr->lt_bw = 0; 544 | bbr->lt_use_bw = 0; 545 | bbr->lt_is_sampling = false; 546 | bbr_reset_lt_bw_sampling_interval(sk); 547 | } 548 | 549 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 550 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 551 | { 552 | struct bbr *bbr = inet_csk_ca(sk); 553 | u32 diff; 554 | 555 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 556 | /* Is new bw close to the lt_bw from the previous interval? */ 557 | diff = abs(bw - bbr->lt_bw); 558 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 559 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 560 | bbr_lt_bw_diff)) { 561 | /* All criteria are met; estimate we're policed. */ 562 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 563 | bbr->lt_use_bw = 1; 564 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 565 | bbr->lt_rtt_cnt = 0; 566 | return; 567 | } 568 | } 569 | bbr->lt_bw = bw; 570 | bbr_reset_lt_bw_sampling_interval(sk); 571 | } 572 | 573 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 574 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 575 | * explicitly models their policed rate, to reduce unnecessary losses. We 576 | * estimate that we're policed if we see 2 consecutive sampling intervals with 577 | * consistent throughput and high packet loss. If we think we're being policed, 578 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 579 | */ 580 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 581 | { 582 | struct tcp_sock *tp = tcp_sk(sk); 583 | struct bbr *bbr = inet_csk_ca(sk); 584 | u32 lost, delivered; 585 | u64 bw; 586 | u32 t; 587 | 588 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 589 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 590 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 591 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 592 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 593 | } 594 | return; 595 | } 596 | 597 | /* Wait for the first loss before sampling, to let the policer exhaust 598 | * its tokens and estimate the steady-state rate allowed by the policer. 599 | * Starting samples earlier includes bursts that over-estimate the bw. 600 | */ 601 | if (!bbr->lt_is_sampling) { 602 | if (!rs->losses) 603 | return; 604 | bbr_reset_lt_bw_sampling_interval(sk); 605 | bbr->lt_is_sampling = true; 606 | } 607 | 608 | /* To avoid underestimates, reset sampling if we run out of data. */ 609 | if (rs->is_app_limited) { 610 | bbr_reset_lt_bw_sampling(sk); 611 | return; 612 | } 613 | 614 | if (bbr->round_start) 615 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 616 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 617 | return; /* sampling interval needs to be longer */ 618 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 619 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 620 | return; 621 | } 622 | 623 | /* End sampling interval when a packet is lost, so we estimate the 624 | * policer tokens were exhausted. Stopping the sampling before the 625 | * tokens are exhausted under-estimates the policed rate. 626 | */ 627 | if (!rs->losses) 628 | return; 629 | 630 | /* Calculate packets lost and delivered in sampling interval. 
*/ 631 | lost = tp->lost - bbr->lt_last_lost; 632 | delivered = tp->delivered - bbr->lt_last_delivered; 633 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 634 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 635 | return; 636 | 637 | /* Find average delivery rate in this sampling interval. */ 638 | t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; 639 | if ((s32)t < 1) 640 | return; /* interval is less than one ms, so wait */ 641 | /* Check if can multiply without overflow */ 642 | if (t >= ~0U / USEC_PER_MSEC) { 643 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 644 | return; 645 | } 646 | t *= USEC_PER_MSEC; 647 | bw = (u64)delivered * BW_UNIT; 648 | do_div(bw, t); 649 | bbr_lt_bw_interval_done(sk, bw); 650 | } 651 | 652 | /* Estimate the bandwidth based on how fast packets are delivered */ 653 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 654 | { 655 | struct tcp_sock *tp = tcp_sk(sk); 656 | struct bbr *bbr = inet_csk_ca(sk); 657 | u64 bw; 658 | 659 | bbr->round_start = 0; 660 | if (rs->delivered < 0 || rs->interval_us <= 0) 661 | return; /* Not a valid observation */ 662 | 663 | /* See if we've reached the next RTT */ 664 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 665 | bbr->next_rtt_delivered = tp->delivered; 666 | bbr->rtt_cnt++; 667 | bbr->round_start = 1; 668 | bbr->packet_conservation = 0; 669 | } 670 | 671 | bbr_lt_bw_sampling(sk, rs); 672 | 673 | /* Divide delivered by the interval to find a (lower bound) bottleneck 674 | * bandwidth sample. Delivered is in packets and interval_us in uS and 675 | * ratio will be <<1 for most connections. So delivered is first scaled. 676 | */ 677 | bw = (u64)rs->delivered * BW_UNIT; 678 | do_div(bw, rs->interval_us); 679 | 680 | /* If this sample is application-limited, it is likely to have a very 681 | * low delivered count that represents application behavior rather than 682 | * the available network rate. Such a sample could drag down estimated 683 | * bw, causing needless slow-down. Thus, to continue to send at the 684 | * last measured network rate, we filter out app-limited samples unless 685 | * they describe the path bw at least as well as our bw model. 686 | * 687 | * So the goal during app-limited phase is to proceed with the best 688 | * network rate no matter how long. We automatically leave this 689 | * phase when app writes faster than the network can deliver :) 690 | */ 691 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 692 | /* Incorporate new sample into our max bw filter. */ 693 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 694 | } 695 | } 696 | 697 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 698 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 699 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 700 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 701 | * higher rwin, 3: we get higher delivery rate samples. Or transient 702 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 703 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
704 | */ 705 | static void bbr_check_full_bw_reached(struct sock *sk, 706 | const struct rate_sample *rs) 707 | { 708 | struct bbr *bbr = inet_csk_ca(sk); 709 | u32 bw_thresh; 710 | 711 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 712 | return; 713 | 714 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 715 | if (bbr_max_bw(sk) >= bw_thresh) { 716 | bbr->full_bw = bbr_max_bw(sk); 717 | bbr->full_bw_cnt = 0; 718 | return; 719 | } 720 | ++bbr->full_bw_cnt; 721 | bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; 722 | } 723 | 724 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 725 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 726 | { 727 | struct bbr *bbr = inet_csk_ca(sk); 728 | 729 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 730 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 731 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 732 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 733 | } /* fall through to check if in-flight is already small: */ 734 | if (bbr->mode == BBR_DRAIN && 735 | tcp_packets_in_flight(tcp_sk(sk)) <= 736 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 737 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 738 | } 739 | 740 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 741 | * periodically drain the bottleneck queue, to converge to measure the true 742 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 743 | * small (reducing queuing delay and packet loss) and achieve fairness among 744 | * BBR flows. 745 | * 746 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 747 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 748 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 749 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 750 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 751 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 752 | * 753 | * Note that flows need only pay 2% if they are busy sending over the last 10 754 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 755 | * natural silences or low-rate periods within 10 seconds where the rate is low 756 | * enough for long enough to drain its queue in the bottleneck. We pick up 757 | * these min RTT measurements opportunistically with our min_rtt filter. 
:-) 758 | */ 759 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 760 | { 761 | struct tcp_sock *tp = tcp_sk(sk); 762 | struct bbr *bbr = inet_csk_ca(sk); 763 | bool filter_expired; 764 | 765 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 766 | filter_expired = after(tcp_jiffies32, 767 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 768 | if (rs->rtt_us >= 0 && 769 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 770 | bbr->min_rtt_us = rs->rtt_us; 771 | bbr->min_rtt_stamp = tcp_jiffies32; 772 | } 773 | 774 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 775 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 776 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 777 | bbr->pacing_gain = BBR_UNIT; 778 | bbr->cwnd_gain = BBR_UNIT; 779 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 780 | bbr->probe_rtt_done_stamp = 0; 781 | } 782 | 783 | if (bbr->mode == BBR_PROBE_RTT) { 784 | /* Ignore low rate samples during this mode. */ 785 | tp->app_limited = 786 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 787 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 788 | if (!bbr->probe_rtt_done_stamp && 789 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 790 | bbr->probe_rtt_done_stamp = tcp_jiffies32 + 791 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 792 | bbr->probe_rtt_round_done = 0; 793 | bbr->next_rtt_delivered = tp->delivered; 794 | } else if (bbr->probe_rtt_done_stamp) { 795 | if (bbr->round_start) 796 | bbr->probe_rtt_round_done = 1; 797 | if (bbr->probe_rtt_round_done && 798 | after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 799 | bbr->min_rtt_stamp = tcp_jiffies32; 800 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 801 | bbr_reset_mode(sk); 802 | } 803 | } 804 | } 805 | bbr->idle_restart = 0; 806 | } 807 | 808 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 809 | { 810 | bbr_update_bw(sk, rs); 811 | bbr_update_cycle_phase(sk, rs); 812 | bbr_check_full_bw_reached(sk, rs); 813 | bbr_check_drain(sk, rs); 814 | bbr_update_min_rtt(sk, rs); 815 | } 816 | 817 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 818 | { 819 | struct bbr *bbr = inet_csk_ca(sk); 820 | u32 bw; 821 | 822 | bbr_update_model(sk, rs); 823 | 824 | bw = bbr_bw(sk); 825 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 826 | bbr_set_tso_segs_goal(sk); 827 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 828 | } 829 | 830 | static void bbr_init(struct sock *sk) 831 | { 832 | struct tcp_sock *tp = tcp_sk(sk); 833 | struct bbr *bbr = inet_csk_ca(sk); 834 | 835 | bbr->prior_cwnd = 0; 836 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 837 | bbr->rtt_cnt = 0; 838 | bbr->next_rtt_delivered = 0; 839 | bbr->prev_ca_state = TCP_CA_Open; 840 | bbr->packet_conservation = 0; 841 | 842 | bbr->probe_rtt_done_stamp = 0; 843 | bbr->probe_rtt_round_done = 0; 844 | bbr->min_rtt_us = tcp_min_rtt(tp); 845 | bbr->min_rtt_stamp = tcp_jiffies32; 846 | 847 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 848 | 849 | bbr->has_seen_rtt = 0; 850 | bbr_init_pacing_rate_from_rtt(sk); 851 | 852 | bbr->restore_cwnd = 0; 853 | bbr->round_start = 0; 854 | bbr->idle_restart = 0; 855 | bbr->full_bw_reached = 0; 856 | bbr->full_bw = 0; 857 | bbr->full_bw_cnt = 0; 858 | bbr->cycle_mstamp = 0; 859 | bbr->cycle_idx = 0; 860 | bbr_reset_lt_bw_sampling(sk); 861 | bbr_reset_startup_mode(sk); 862 | 863 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 
864 | } 865 | 866 | static u32 bbr_sndbuf_expand(struct sock *sk) 867 | { 868 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 869 | return 3; 870 | } 871 | 872 | /* In theory BBR does not need to undo the cwnd since it does not 873 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 874 | */ 875 | static u32 bbr_undo_cwnd(struct sock *sk) 876 | { 877 | struct bbr *bbr = inet_csk_ca(sk); 878 | 879 | bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ 880 | bbr->full_bw_cnt = 0; 881 | bbr_reset_lt_bw_sampling(sk); 882 | return tcp_sk(sk)->snd_cwnd; 883 | } 884 | 885 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ 886 | static u32 bbr_ssthresh(struct sock *sk) 887 | { 888 | bbr_save_cwnd(sk); 889 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 890 | } 891 | 892 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 893 | union tcp_cc_info *info) 894 | { 895 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 896 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 897 | struct tcp_sock *tp = tcp_sk(sk); 898 | struct bbr *bbr = inet_csk_ca(sk); 899 | u64 bw = bbr_bw(sk); 900 | 901 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 902 | memset(&info->bbr, 0, sizeof(info->bbr)); 903 | info->bbr.bbr_bw_lo = (u32)bw; 904 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 905 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 906 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 907 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 908 | *attr = INET_DIAG_BBRINFO; 909 | return sizeof(info->bbr); 910 | } 911 | return 0; 912 | } 913 | 914 | static void bbr_set_state(struct sock *sk, u8 new_state) 915 | { 916 | struct bbr *bbr = inet_csk_ca(sk); 917 | 918 | if (new_state == TCP_CA_Loss) { 919 | struct rate_sample rs = { .losses = 1 }; 920 | 921 | bbr->prev_ca_state = TCP_CA_Loss; 922 | bbr->full_bw = 0; 923 | bbr->round_start = 1; /* treat RTO like end of a round */ 924 | bbr_lt_bw_sampling(sk, &rs); 925 | } 926 | } 927 | 928 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 929 | .flags = TCP_CONG_NON_RESTRICTED, 930 | .name = "bbr", 931 | .owner = THIS_MODULE, 932 | .init = bbr_init, 933 | .cong_control = bbr_main, 934 | .sndbuf_expand = bbr_sndbuf_expand, 935 | .undo_cwnd = bbr_undo_cwnd, 936 | .cwnd_event = bbr_cwnd_event, 937 | .ssthresh = bbr_ssthresh, 938 | .tso_segs_goal = bbr_tso_segs_goal, 939 | .get_info = bbr_get_info, 940 | .set_state = bbr_set_state, 941 | }; 942 | 943 | static int __init bbr_register(void) 944 | { 945 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 946 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 947 | } 948 | 949 | static void __exit bbr_unregister(void) 950 | { 951 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 952 | } 953 | 954 | module_init(bbr_register); 955 | module_exit(bbr_unregister); 956 | 957 | MODULE_AUTHOR("Van Jacobson "); 958 | MODULE_AUTHOR("Neal Cardwell "); 959 | MODULE_AUTHOR("Yuchung Cheng "); 960 | MODULE_AUTHOR("Soheil Hassas Yeganeh "); 961 | MODULE_LICENSE("Dual BSD/GPL"); 962 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 963 | -------------------------------------------------------------------------------- /Makefile/Makefile-CentOS: -------------------------------------------------------------------------------- 1 | obj-m := tcp_nanqinlang.o 2 | 3 | all: 4 | make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc 5 | 6 | clean: 7 | make -C /lib/modules/`uname -r`/build M=`pwd` 
clean 8 | 9 | install: 10 | install tcp_nanqinlang.ko /lib/modules/`uname -r`/kernel/net/ipv4 11 | insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 12 | depmod -a 13 | 14 | uninstall: 15 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko -------------------------------------------------------------------------------- /Makefile/Makefile-Debian7or8: -------------------------------------------------------------------------------- 1 | obj-m := tcp_nanqinlang.o 2 | 3 | all: 4 | make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc-4.9 5 | 6 | clean: 7 | make -C /lib/modules/`uname -r`/build M=`pwd` clean 8 | 9 | install: 10 | install tcp_nanqinlang.ko /lib/modules/`uname -r`/kernel/net/ipv4 11 | insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 12 | depmod -a 13 | 14 | uninstall: 15 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko -------------------------------------------------------------------------------- /Makefile/Makefile-Debian9: -------------------------------------------------------------------------------- 1 | obj-m := tcp_nanqinlang.o 2 | 3 | all: 4 | make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc-6 5 | 6 | clean: 7 | make -C /lib/modules/`uname -r`/build M=`pwd` clean 8 | 9 | install: 10 | install tcp_nanqinlang.ko /lib/modules/`uname -r`/kernel/net/ipv4 11 | insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 12 | depmod -a 13 | 14 | uninstall: 15 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # tcp_nanqinlang 2 | 3 | [![build](https://github.com/nanqinlang/SVG/blob/master/build%20passing.svg)](https://github.com/tcp-nanqinlang/general) 4 | [![language1](https://github.com/nanqinlang/SVG/blob/master/language-c-blue.svg)](https://github.com/tcp-nanqinlang/general) 5 | [![language2](https://github.com/nanqinlang/SVG/blob/master/language-shell-blue.svg)](https://github.com/tcp-nanqinlang/general) 6 | [![author](https://github.com/nanqinlang/SVG/blob/master/author-nanqinlang-lightgrey.svg)](https://github.com/tcp-nanqinlang/general) 7 | [![license](https://github.com/nanqinlang/SVG/blob/master/license-GPLv3-orange.svg)](https://github.com/tcp-nanqinlang/general) 8 | 9 | An aggressively tuned (`violence`) enhancement of tcp_bbr 10 | 11 | Note that the install script works on `KVM` (or better) platforms 12 | 13 | ## References 14 | Update History 15 | https://github.com/tcp-nanqinlang/general/releases 16 | 17 | Chinese documentation (中文文档) 18 | https://github.com/tcp-nanqinlang/wiki/wiki/general 19 | --------------------------------------------------------------------------------
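The Makefiles above build, install, and `insmod` the module, but the kernel must also be told to use it as the active congestion control. A minimal sketch follows, assuming the module registers its algorithm under the name `nanqinlang` (the registered name is not visible in the files shown here, so confirm it via `net.ipv4.tcp_available_congestion_control` first) and following the `fq` pacing recommendation from the tcp_bbr.c header comment.

```bash
# Minimal sketch, assuming the algorithm registers as "nanqinlang" (verify before use).
lsmod | grep tcp_nanqinlang                         # module should already be loaded by `make install`
sysctl net.ipv4.tcp_available_congestion_control    # confirm the exact algorithm name

# Persist the settings: use fq for pacing and select the algorithm.
cat >> /etc/sysctl.conf <<'EOF'
net.core.default_qdisc=fq
net.ipv4.tcp_congestion_control=nanqinlang
EOF
sysctl -p                                           # apply now
sysctl net.ipv4.tcp_congestion_control              # verify the active algorithm
```

If the reported name differs on your system, substitute it in the sysctl lines above.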