├── General
│   ├── CentOS
│   │   ├── bash
│   │   │   ├── tcp_nanqinlang-1.3.2-nocheckvirt.sh
│   │   │   └── tcp_nanqinlang-1.3.2.sh
│   │   └── source
│   │       ├── tcp_bbr.c
│   │       └── tcp_nanqinlang.c
│   └── Debian
│       ├── mod
│       │   └── tcp_nanqinlang-for-v4.10.2.ko
│       └── source
│           ├── kernel-v4.12andbelow
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           ├── kernel-v4.13
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           ├── kernel-v4.14
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           ├── kernel-v4.15
│           │   ├── tcp_bbr.c
│           │   └── tcp_nanqinlang.c
│           └── kernel-v4.16
│               ├── tcp_bbr.c
│               └── tcp_nanqinlang.c
├── Makefile
│   ├── Makefile-CentOS
│   ├── Makefile-Debian7or8
│   └── Makefile-Debian9
└── readme.md
/General/CentOS/bash/tcp_nanqinlang-1.3.2-nocheckvirt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | Green_font="\033[32m" && Yellow_font="\033[33m" && Red_font="\033[31m" && Font_suffix="\033[0m" 3 | Info="${Green_font}[Info]${Font_suffix}" 4 | Error="${Red_font}[Error]${Font_suffix}" 5 | echo -e "${Green_font} 6 | #====================================================== 7 | # Project: tcp_nanqinlang general 8 | # Platform: --CentOS_6/7_64bit --nocheckvirt 9 | # Version: 1.3.2 10 | # Author: nanqinlang 11 | # Blog: https://sometimesnaive.org 12 | # Github: https://github.com/nanqinlang 13 | #======================================================${Font_suffix}" 14 | 15 | check_system(){ 16 | #sort 17 | [[ -z "`cat /etc/redhat-release | grep -iE "CentOS"`" ]] && echo -e "${Error} only support CentOS !" && exit 1 18 | #number 19 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 7."`" ]] && bit=7 20 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 6."`" ]] && bit=6 21 | #bit 22 | [[ "`uname -m`" != "x86_64" ]] && echo -e "${Error} only support 64bit !" && exit 1 23 | } 24 | 25 | check_root(){ 26 | [[ "`id -u`" != "0" ]] && echo -e "${Error} must be root user !" && exit 1 27 | } 28 | 29 | check_kvm(){ 30 | yum update 31 | yum install -y virt-what 32 | [[ "`virt-what`" != "kvm" ]] && echo -e "${Error} only support KVM !" && exit 1 33 | } 34 | 35 | directory(){ 36 | [[ ! 
-d /home/tcp_nanqinlang ]] && mkdir -p /home/tcp_nanqinlang 37 | cd /home/tcp_nanqinlang 38 | } 39 | 40 | check_kernel(){ 41 | # check 4.12.10 already installed or not 42 | already_image=`rpm -qa | grep kernel-4.12.10` 43 | already_devel=`rpm -qa | grep kernel-devel-4.12.10` 44 | already_headers=`rpm -qa | grep kernel-headers-4.12.10` 45 | 46 | delete_surplus_1 47 | 48 | if [[ -z "${already_image}" ]]; then 49 | echo -e "${Info} installing image" && install_image 50 | else echo -e "${Info} noneed install image" 51 | fi 52 | 53 | if [[ -z "${already_devel}" ]]; then 54 | echo -e "${Info} installing devel" && install_devel 55 | else echo -e "${Info} noneed install devel" 56 | fi 57 | 58 | if [[ -z "${already_headers}" ]]; then 59 | echo -e "${Info} installing headers" && install_headers 60 | else echo -e "${Info} noneed install headers" 61 | fi 62 | 63 | update-grub 64 | 65 | } 66 | 67 | delete_surplus_1(){ 68 | #surplus_image=`rpm -qa | grep kernel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 69 | #surplus_devel=`rpm -qa | grep kernel-devel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 70 | #surplus_headers=`rpm -qa | grep kernel-headers | awk '{print $2}' | grep -v "4.12.10" | wc -l` 71 | 72 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 73 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 74 | 75 | while [[ "${surplus_count}" > "1" ]] 76 | do 77 | yum remove -y ${surplus_sort_1} 78 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 79 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 80 | done 81 | } 82 | 83 | delete_surplus_2(){ 84 | current=`uname -r | grep -v "4.12.10"` 85 | if [[ -z "${current}" ]]; then 86 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 87 | while [[ ! -z "${surplus_sort_2}" ]] 88 | do 89 | yum remove -y ${surplus_sort_2} 90 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 91 | done 92 | else 93 | echo -e "${Error} current running kernel is not v4.12.10, please check !" 94 | fi 95 | } 96 | 97 | # achieve 98 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el6/x86_64/RPMS/ 99 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el7/x86_64/RPMS/ 100 | # my backup: https://github.com/nanqinlang/CentOS-kernel 101 | install_image(){ 102 | #[[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 103 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 104 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} ==image download failed, please check !" && exit 1 105 | yum install -y kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 106 | } 107 | install_devel(){ 108 | #[[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 109 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 110 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} devel download failed, please check !" 
&& exit 1 111 | yum install -y kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 112 | } 113 | install_headers(){ 114 | #[[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 115 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 116 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} headers download failed, please check !" && exit 1 117 | yum install -y kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 118 | } 119 | 120 | update-grub(){ 121 | [[ "${bit}" = "7" ]] && grub2-mkconfig -o /boot/grub2/grub.cfg && grub2-set-default 0 122 | [[ "${bit}" = "6" ]] && sed -i '/default=/d' /boot/grub/grub.conf && echo -e "\ndefault=0\c" >> /boot/grub/grub.conf 123 | } 124 | 125 | rpm_list(){ 126 | rpm -qa | grep kernel 127 | } 128 | 129 | maker(){ 130 | yum groupinstall -y "Development Tools" && yum update 131 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && compile 132 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && echo -e "${Error} load mod failed, please check!" && exit 1 133 | } 134 | 135 | compile(){ 136 | wget https://raw.githubusercontent.com/tcp-nanqinlang/general/master/General/CentOS/source/tcp_nanqinlang.c 137 | wget -O Makefile https://raw.githubusercontent.com/tcp-nanqinlang/general/master/Makefile/Makefile-CentOS 138 | make && make install 139 | } 140 | 141 | check_status(){ 142 | #status_sysctl=`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'` 143 | #status_lsmod=`lsmod | grep nanqinlang` 144 | if [[ "`lsmod | grep nanqinlang`" != "" ]]; then 145 | echo -e "${Info} tcp_nanqinlang is installed !" 146 | if [[ "`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'`" = "nanqinlang" ]]; then 147 | echo -e "${Info} tcp_nanqinlang is running !" 148 | else echo -e "${Error} tcp_nanqinlang is installed but not running !" 149 | fi 150 | else 151 | echo -e "${Error} tcp_nanqinlang not installed !" 152 | fi 153 | } 154 | 155 | 156 | 157 | ################################################################################################### 158 | install(){ 159 | check_system 160 | check_root 161 | #check_kvm 162 | directory 163 | check_kernel 164 | rpm_list 165 | echo -e "${Info} 请确认此行上面的列表显示的内核版本后,重启以应用新内核" 166 | } 167 | 168 | start(){ 169 | check_system 170 | check_root 171 | #check_kvm 172 | directory 173 | delete_surplus_2 && update-grub 174 | maker 175 | sed -i '/net\.core\.default_qdisc/d' /etc/sysctl.conf 176 | sed -i '/net\.ipv4\.tcp_congestion_control/d' /etc/sysctl.conf 177 | echo -e "\nnet.core.default_qdisc=fq" >> /etc/sysctl.conf 178 | echo -e "net.ipv4.tcp_congestion_control=nanqinlang\c" >> /etc/sysctl.conf 179 | sysctl -p 180 | check_status 181 | rm -rf /home/tcp_nanqinlang 182 | } 183 | 184 | status(){ 185 | check_status 186 | } 187 | 188 | uninstall(){ 189 | check_root 190 | sed -i '/net\.core\.default_qdisc=/d' /etc/sysctl.conf 191 | sed -i '/net\.ipv4\.tcp_congestion_control=/d' /etc/sysctl.conf 192 | sysctl -p 193 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 194 | echo -e "${Info} please remember ${reboot} to stop tcp_nanqinlang !" 
195 | } 196 | 197 | echo -e "${Info} 选择你要使用的功能: " 198 | echo -e "1.安装内核\n2.开启算法\n3.检查算法运行状态\n4.卸载算法" 199 | read -p "输入数字以选择:" function 200 | 201 | while [[ ! "${function}" =~ ^[1-4]$ ]] 202 | do 203 | echo -e "${Error} 无效输入" 204 | echo -e "${Info} 请重新选择" && read -p "输入数字以选择:" function 205 | done 206 | 207 | if [[ "${function}" == "1" ]]; then 208 | install 209 | elif [[ "${function}" == "2" ]]; then 210 | start 211 | elif [[ "${function}" == "3" ]]; then 212 | status 213 | else 214 | uninstall 215 | fi 216 | -------------------------------------------------------------------------------- /General/CentOS/bash/tcp_nanqinlang-1.3.2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | Green_font="\033[32m" && Yellow_font="\033[33m" && Red_font="\033[31m" && Font_suffix="\033[0m" 3 | Info="${Green_font}[Info]${Font_suffix}" 4 | Error="${Red_font}[Error]${Font_suffix}" 5 | echo -e "${Green_font} 6 | #====================================================== 7 | # Project: tcp_nanqinlang general 8 | # Platform: --CentOS_6/7_64bit --KVM 9 | # Version: 1.3.2 10 | # Author: nanqinlang 11 | # Blog: https://sometimesnaive.org 12 | # Github: https://github.com/nanqinlang 13 | #======================================================${Font_suffix}" 14 | 15 | check_system(){ 16 | #sort 17 | [[ -z "`cat /etc/redhat-release | grep -iE "CentOS"`" ]] && echo -e "${Error} only support CentOS !" && exit 1 18 | #number 19 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 7."`" ]] && bit=7 20 | [[ ! -z "`cat /etc/redhat-release | grep -iE " 6."`" ]] && bit=6 21 | #bit 22 | [[ "`uname -m`" != "x86_64" ]] && echo -e "${Error} only support 64bit !" && exit 1 23 | } 24 | 25 | check_root(){ 26 | [[ "`id -u`" != "0" ]] && echo -e "${Error} must be root user !" && exit 1 27 | } 28 | 29 | check_kvm(){ 30 | yum update 31 | yum install -y virt-what 32 | [[ "`virt-what`" != "kvm" ]] && echo -e "${Error} only support KVM !" && exit 1 33 | } 34 | 35 | directory(){ 36 | [[ ! 
-d /home/tcp_nanqinlang ]] && mkdir -p /home/tcp_nanqinlang 37 | cd /home/tcp_nanqinlang 38 | } 39 | 40 | check_kernel(){ 41 | # check 4.12.10 already installed or not 42 | already_image=`rpm -qa | grep kernel-4.12.10` 43 | already_devel=`rpm -qa | grep kernel-devel-4.12.10` 44 | already_headers=`rpm -qa | grep kernel-headers-4.12.10` 45 | 46 | delete_surplus_1 47 | 48 | if [[ -z "${already_image}" ]]; then 49 | echo -e "${Info} installing image" && install_image 50 | else echo -e "${Info} noneed install image" 51 | fi 52 | 53 | if [[ -z "${already_devel}" ]]; then 54 | echo -e "${Info} installing devel" && install_devel 55 | else echo -e "${Info} noneed install devel" 56 | fi 57 | 58 | if [[ -z "${already_headers}" ]]; then 59 | echo -e "${Info} installing headers" && install_headers 60 | else echo -e "${Info} noneed install headers" 61 | fi 62 | 63 | update-grub 64 | 65 | } 66 | 67 | delete_surplus_1(){ 68 | #surplus_image=`rpm -qa | grep kernel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 69 | #surplus_devel=`rpm -qa | grep kernel-devel | awk '{print $2}' | grep -v "4.12.10" | wc -l` 70 | #surplus_headers=`rpm -qa | grep kernel-headers | awk '{print $2}' | grep -v "4.12.10" | wc -l` 71 | 72 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 73 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 74 | 75 | while [[ "${surplus_count}" > "1" ]] 76 | do 77 | yum remove -y ${surplus_sort_1} 78 | surplus_count=`rpm -qa | grep kernel | grep -v "4.12.10" | wc -l` 79 | surplus_sort_1=`rpm -qa | grep kernel | grep -v "4.12.10"` 80 | done 81 | } 82 | 83 | delete_surplus_2(){ 84 | current=`uname -r | grep -v "4.12.10"` 85 | if [[ -z "${current}" ]]; then 86 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 87 | while [[ ! -z "${surplus_sort_2}" ]] 88 | do 89 | yum remove -y ${surplus_sort_2} 90 | surplus_sort_2=`rpm -qa | grep kernel | grep -v "4.12.10" | grep -v "dracut-kernel-004-409.el6_8.2.noarch"` 91 | done 92 | else 93 | echo -e "${Error} current running kernel is not v4.12.10, please check !" 94 | fi 95 | } 96 | 97 | # achieve 98 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el6/x86_64/RPMS/ 99 | # http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el7/x86_64/RPMS/ 100 | # my backup: https://github.com/nanqinlang/CentOS-kernel 101 | install_image(){ 102 | #[[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 103 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 104 | [[ ! -f kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} ==image download failed, please check !" && exit 1 105 | yum install -y kernel-ml-4.12.10-1.el${bit}.elrepo.x86_64.rpm 106 | } 107 | install_devel(){ 108 | #[[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 109 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 110 | [[ ! -f kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} devel download failed, please check !" 
&& exit 1 111 | yum install -y kernel-ml-devel-4.12.10-1.el${bit}.elrepo.x86_64.rpm 112 | } 113 | install_headers(){ 114 | #[[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget http://elrepo.mirror.angkasa.id/elrepo/archive/kernel/el${bit}/x86_64/RPMS/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 115 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && wget https://raw.githubusercontent.com/tcp-nanqinlang/CentOS-kernel/master/kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 116 | [[ ! -f kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm ]] && echo -e "${Error} headers download failed, please check !" && exit 1 117 | yum install -y kernel-ml-headers-4.12.10-1.el${bit}.elrepo.x86_64.rpm 118 | } 119 | 120 | update-grub(){ 121 | [[ "${bit}" = "7" ]] && grub2-mkconfig -o /boot/grub2/grub.cfg && grub2-set-default 0 122 | [[ "${bit}" = "6" ]] && sed -i '/default=/d' /boot/grub/grub.conf && echo -e "\ndefault=0\c" >> /boot/grub/grub.conf 123 | } 124 | 125 | rpm_list(){ 126 | rpm -qa | grep kernel 127 | } 128 | 129 | maker(){ 130 | yum groupinstall -y "Development Tools" && yum update 131 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && compile 132 | [[ ! -e /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko ]] && echo -e "${Error} load mod failed, please check!" && exit 1 133 | } 134 | 135 | compile(){ 136 | wget https://raw.githubusercontent.com/tcp-nanqinlang/general/master/General/CentOS/source/tcp_nanqinlang.c 137 | wget -O Makefile https://raw.githubusercontent.com/tcp-nanqinlang/general/master/Makefile/Makefile-CentOS 138 | make && make install 139 | } 140 | 141 | check_status(){ 142 | #status_sysctl=`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'` 143 | #status_lsmod=`lsmod | grep nanqinlang` 144 | if [[ "`lsmod | grep nanqinlang`" != "" ]]; then 145 | echo -e "${Info} tcp_nanqinlang is installed !" 146 | if [[ "`sysctl net.ipv4.tcp_available_congestion_control | awk '{print $3}'`" = "nanqinlang" ]]; then 147 | echo -e "${Info} tcp_nanqinlang is running !" 148 | else echo -e "${Error} tcp_nanqinlang is installed but not running !" 149 | fi 150 | else 151 | echo -e "${Error} tcp_nanqinlang not installed !" 152 | fi 153 | } 154 | 155 | 156 | 157 | ################################################################################################### 158 | install(){ 159 | check_system 160 | check_root 161 | check_kvm 162 | directory 163 | check_kernel 164 | rpm_list 165 | echo -e "${Info} 请确认此行上面的列表显示的内核版本后,重启以应用新内核" 166 | } 167 | 168 | start(){ 169 | check_system 170 | check_root 171 | check_kvm 172 | directory 173 | delete_surplus_2 && update-grub 174 | maker 175 | sed -i '/net\.core\.default_qdisc/d' /etc/sysctl.conf 176 | sed -i '/net\.ipv4\.tcp_congestion_control/d' /etc/sysctl.conf 177 | echo -e "\nnet.core.default_qdisc=fq" >> /etc/sysctl.conf 178 | echo -e "net.ipv4.tcp_congestion_control=nanqinlang\c" >> /etc/sysctl.conf 179 | sysctl -p 180 | check_status 181 | rm -rf /home/tcp_nanqinlang 182 | } 183 | 184 | status(){ 185 | check_status 186 | } 187 | 188 | uninstall(){ 189 | check_root 190 | sed -i '/net\.core\.default_qdisc=/d' /etc/sysctl.conf 191 | sed -i '/net\.ipv4\.tcp_congestion_control=/d' /etc/sysctl.conf 192 | sysctl -p 193 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 194 | echo -e "${Info} please remember ${reboot} to stop tcp_nanqinlang !" 
195 | } 196 | 197 | echo -e "${Info} 选择你要使用的功能: " 198 | echo -e "1.安装内核\n2.开启算法\n3.检查算法运行状态\n4.卸载算法" 199 | read -p "输入数字以选择:" function 200 | 201 | while [[ ! "${function}" =~ ^[1-4]$ ]] 202 | do 203 | echo -e "${Error} 无效输入" 204 | echo -e "${Info} 请重新选择" && read -p "输入数字以选择:" function 205 | done 206 | 207 | if [[ "${function}" == "1" ]]; then 208 | install 209 | elif [[ "${function}" == "2" ]]; then 210 | start 211 | elif [[ "${function}" == "3" ]]; then 212 | status 213 | else 214 | uninstall 215 | fi -------------------------------------------------------------------------------- /General/CentOS/source/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable. 16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * since pacing is integral to the BBR design and implementation. 57 | * BBR without pacing would not function properly, and may incur unnecessary 58 | * high packet loss rates. 
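 *
 * Worked example (illustrative only; the numbers below are hypothetical and
 * not part of the upstream source): if the windowed-max delivery rate is
 * 12,500 pkts/sec and min_rtt is 40 ms, then during STARTUP, where both gains
 * are ~2.885 (2/ln 2):
 *
 *   pacing_rate = 2.885 * 12500          ~= 36,060 pkts/sec
 *   cwnd        = 2.885 * 12500 * 0.040  ~= 1,443 packets   (gain * BDP, BDP = 500 pkts)
 *
 * In steady-state PROBE_BW the pacing_gain cycle averages 1.0, so the pacing
 * rate settles near the measured 12,500 pkts/sec, while cwnd_gain = 2 keeps
 * cwnd around 1,000 packets (2 * BDP) to tolerate delayed/stretched ACKs.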
59 | */ 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | 67 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 68 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 69 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 70 | * Since the minimum window is >=4 packets, the lower bound isn't 71 | * an issue. The upper bound isn't an issue with existing technologies. 72 | */ 73 | #define BW_SCALE 24 74 | #define BW_UNIT (1 << BW_SCALE) 75 | 76 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ 77 | #define BBR_UNIT (1 << BBR_SCALE) 78 | 79 | /* BBR has the following modes for deciding how fast to send: */ 80 | enum bbr_mode { 81 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 82 | BBR_DRAIN, /* drain any queue created during startup */ 83 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 84 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 85 | }; 86 | 87 | /* BBR congestion control block */ 88 | struct bbr { 89 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 90 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 91 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 92 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 93 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 94 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 95 | struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ 96 | u32 mode:3, /* current bbr_mode in state machine */ 97 | prev_ca_state:3, /* CA state on previous ACK */ 98 | packet_conservation:1, /* use packet conservation? */ 99 | restore_cwnd:1, /* decided to revert cwnd to old value */ 100 | round_start:1, /* start of packet-timed tx->ack round? */ 101 | tso_segs_goal:7, /* segments we want in each skb we send */ 102 | idle_restart:1, /* restarting after idle? */ 103 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 104 | unused:5, 105 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 106 | lt_rtt_cnt:7, /* round trips in long-term interval */ 107 | lt_use_bw:1; /* use lt_bw as our bw estimate? 
*/ 108 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 109 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 110 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 111 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 112 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 113 | cwnd_gain:10, /* current gain for setting cwnd */ 114 | full_bw_cnt:3, /* number of rounds without large bw gains */ 115 | cycle_idx:3, /* current index in pacing_gain cycle array */ 116 | unused_b:6; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... 
*/ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? */ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Pace using current bw estimate and a gain factor. In order to help drive the 216 | * network toward lower queues while maintaining high utilization and low 217 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 218 | * estimated bandwidth. This is an important aspect of the design. In this 219 | * implementation this slightly lower pacing rate is achieved implicitly by not 220 | * including link-layer headers in the packet size used for the pacing rate. 221 | */ 222 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 223 | { 224 | struct bbr *bbr = inet_csk_ca(sk); 225 | u64 rate = bw; 226 | 227 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 228 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 229 | if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) 230 | sk->sk_pacing_rate = rate; 231 | } 232 | 233 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 234 | static u32 bbr_tso_segs_goal(struct sock *sk) 235 | { 236 | struct bbr *bbr = inet_csk_ca(sk); 237 | 238 | return bbr->tso_segs_goal; 239 | } 240 | 241 | static void bbr_set_tso_segs_goal(struct sock *sk) 242 | { 243 | struct tcp_sock *tp = tcp_sk(sk); 244 | struct bbr *bbr = inet_csk_ca(sk); 245 | u32 min_segs; 246 | 247 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 248 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 249 | 0x7FU); 250 | } 251 | 252 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 253 | static void bbr_save_cwnd(struct sock *sk) 254 | { 255 | struct tcp_sock *tp = tcp_sk(sk); 256 | struct bbr *bbr = inet_csk_ca(sk); 257 | 258 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 259 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 260 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 261 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 262 | } 263 | 264 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 265 | { 266 | struct tcp_sock *tp = tcp_sk(sk); 267 | struct bbr *bbr = inet_csk_ca(sk); 268 | 269 | if (event == CA_EVENT_TX_START && tp->app_limited) { 270 | bbr->idle_restart = 1; 271 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 272 | * need more speed (we're restarting from idle and app-limited). 273 | */ 274 | if (bbr->mode == BBR_PROBE_BW) 275 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 276 | } 277 | } 278 | 279 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 280 | * estimated bottleneck bandwidth: 281 | * 282 | * cwnd = bw * min_rtt * gain = BDP * gain 283 | * 284 | * The key factor, gain, controls the amount of queue. While a small gain 285 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 286 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 287 | * noise may cause BBR to under-estimate the rate. 288 | * 289 | * To achieve full performance in high-speed paths, we budget enough cwnd to 290 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 291 | * - one skb in sending host Qdisc, 292 | * - one skb in sending host TSO/GSO engine 293 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 294 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 295 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 296 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 297 | * full even with ACK-every-other-packet delayed ACKs. 298 | */ 299 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 300 | { 301 | struct bbr *bbr = inet_csk_ca(sk); 302 | u32 cwnd; 303 | u64 w; 304 | 305 | /* If we've never had a valid RTT sample, cap cwnd at the initial 306 | * default. This should only happen when the connection is not using TCP 307 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 308 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 309 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 310 | */ 311 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 312 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 313 | 314 | w = (u64)bw * bbr->min_rtt_us; 315 | 316 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 317 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 318 | 319 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 320 | cwnd += 3 * bbr->tso_segs_goal; 321 | 322 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 323 | cwnd = (cwnd + 1) & ~1U; 324 | 325 | return cwnd; 326 | } 327 | 328 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 329 | * follow the packet conservation principle: send P packets per P packets acked. 330 | * After that, we slow-start and send at most 2*P packets per P packets acked. 331 | * After recovery finishes, or upon undo, we restore the cwnd we had when 332 | * recovery started (capped by the target cwnd based on estimated BDP). 333 | * 334 | * TODO(ycheng/ncardwell): implement a rate-based approach. 335 | */ 336 | static bool bbr_set_cwnd_to_recover_or_restore( 337 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 338 | { 339 | struct tcp_sock *tp = tcp_sk(sk); 340 | struct bbr *bbr = inet_csk_ca(sk); 341 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 342 | u32 cwnd = tp->snd_cwnd; 343 | 344 | /* An ACK for P pkts should release at most 2*P packets. We do this 345 | * in two steps. First, here we deduct the number of lost packets. 346 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 347 | */ 348 | if (rs->losses > 0) 349 | cwnd = max_t(s32, cwnd - rs->losses, 1); 350 | 351 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 352 | /* Starting 1st round of Recovery, so do packet conservation. */ 353 | bbr->packet_conservation = 1; 354 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 355 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 356 | cwnd = tcp_packets_in_flight(tp) + acked; 357 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 358 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 359 | bbr->restore_cwnd = 1; 360 | bbr->packet_conservation = 0; 361 | } 362 | bbr->prev_ca_state = state; 363 | 364 | if (bbr->restore_cwnd) { 365 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 366 | cwnd = max(cwnd, bbr->prior_cwnd); 367 | bbr->restore_cwnd = 0; 368 | } 369 | 370 | if (bbr->packet_conservation) { 371 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 372 | return true; /* yes, using packet conservation */ 373 | } 374 | *new_cwnd = cwnd; 375 | return false; 376 | } 377 | 378 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 379 | * has drawn us down below target), or snap down to target if we're above it. 380 | */ 381 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 382 | u32 acked, u32 bw, int gain) 383 | { 384 | struct tcp_sock *tp = tcp_sk(sk); 385 | struct bbr *bbr = inet_csk_ca(sk); 386 | u32 cwnd = 0, target_cwnd = 0; 387 | 388 | if (!acked) 389 | return; 390 | 391 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 392 | goto done; 393 | 394 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 395 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 396 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 397 | cwnd = min(cwnd + acked, target_cwnd); 398 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 399 | cwnd = cwnd + acked; 400 | cwnd = max(cwnd, bbr_cwnd_min_target); 401 | 402 | done: 403 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 404 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 405 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 406 | } 407 | 408 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 409 | static bool bbr_is_next_cycle_phase(struct sock *sk, 410 | const struct rate_sample *rs) 411 | { 412 | struct tcp_sock *tp = tcp_sk(sk); 413 | struct bbr *bbr = inet_csk_ca(sk); 414 | bool is_full_length = 415 | skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > 416 | bbr->min_rtt_us; 417 | u32 inflight, bw; 418 | 419 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 420 | * use the pipe without increasing the queue. 421 | */ 422 | if (bbr->pacing_gain == BBR_UNIT) 423 | return is_full_length; /* just use wall clock time */ 424 | 425 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 426 | bw = bbr_max_bw(sk); 427 | 428 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 429 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 430 | * small (e.g. on a LAN). We do not persist if packets are lost, since 431 | * a path with small buffers may not hold that much. 432 | */ 433 | if (bbr->pacing_gain > BBR_UNIT) 434 | return is_full_length && 435 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 436 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 437 | 438 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 439 | * probing didn't find more bw. If inflight falls to match BDP then we 440 | * estimate queue is drained; persisting would underutilize the pipe. 441 | */ 442 | return is_full_length || 443 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 444 | } 445 | 446 | static void bbr_advance_cycle_phase(struct sock *sk) 447 | { 448 | struct tcp_sock *tp = tcp_sk(sk); 449 | struct bbr *bbr = inet_csk_ca(sk); 450 | 451 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 452 | bbr->cycle_mstamp = tp->delivered_mstamp; 453 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 454 | } 455 | 456 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 457 | static void bbr_update_cycle_phase(struct sock *sk, 458 | const struct rate_sample *rs) 459 | { 460 | struct bbr *bbr = inet_csk_ca(sk); 461 | 462 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 463 | bbr_is_next_cycle_phase(sk, rs)) 464 | bbr_advance_cycle_phase(sk); 465 | } 466 | 467 | static void bbr_reset_startup_mode(struct sock *sk) 468 | { 469 | struct bbr *bbr = inet_csk_ca(sk); 470 | 471 | bbr->mode = BBR_STARTUP; 472 | bbr->pacing_gain = bbr_high_gain; 473 | bbr->cwnd_gain = bbr_high_gain; 474 | } 475 | 476 | static void bbr_reset_probe_bw_mode(struct sock *sk) 477 | { 478 | struct bbr *bbr = inet_csk_ca(sk); 479 | 480 | bbr->mode = BBR_PROBE_BW; 481 | bbr->pacing_gain = BBR_UNIT; 482 | bbr->cwnd_gain = bbr_cwnd_gain; 483 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 484 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 485 | } 486 | 487 | static void bbr_reset_mode(struct sock *sk) 488 | { 489 | if (!bbr_full_bw_reached(sk)) 490 | bbr_reset_startup_mode(sk); 491 | else 492 | bbr_reset_probe_bw_mode(sk); 493 | } 494 | 495 | /* Start a new long-term sampling interval. */ 496 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 497 | { 498 | struct tcp_sock *tp = tcp_sk(sk); 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; 502 | bbr->lt_last_delivered = tp->delivered; 503 | bbr->lt_last_lost = tp->lost; 504 | bbr->lt_rtt_cnt = 0; 505 | } 506 | 507 | /* Completely reset long-term bandwidth sampling. 
*/ 508 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 509 | { 510 | struct bbr *bbr = inet_csk_ca(sk); 511 | 512 | bbr->lt_bw = 0; 513 | bbr->lt_use_bw = 0; 514 | bbr->lt_is_sampling = false; 515 | bbr_reset_lt_bw_sampling_interval(sk); 516 | } 517 | 518 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 519 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 520 | { 521 | struct bbr *bbr = inet_csk_ca(sk); 522 | u32 diff; 523 | 524 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 525 | /* Is new bw close to the lt_bw from the previous interval? */ 526 | diff = abs(bw - bbr->lt_bw); 527 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 528 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 529 | bbr_lt_bw_diff)) { 530 | /* All criteria are met; estimate we're policed. */ 531 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 532 | bbr->lt_use_bw = 1; 533 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 534 | bbr->lt_rtt_cnt = 0; 535 | return; 536 | } 537 | } 538 | bbr->lt_bw = bw; 539 | bbr_reset_lt_bw_sampling_interval(sk); 540 | } 541 | 542 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 543 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 544 | * explicitly models their policed rate, to reduce unnecessary losses. We 545 | * estimate that we're policed if we see 2 consecutive sampling intervals with 546 | * consistent throughput and high packet loss. If we think we're being policed, 547 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 548 | */ 549 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 550 | { 551 | struct tcp_sock *tp = tcp_sk(sk); 552 | struct bbr *bbr = inet_csk_ca(sk); 553 | u32 lost, delivered; 554 | u64 bw; 555 | s32 t; 556 | 557 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 558 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 559 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 560 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 561 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 562 | } 563 | return; 564 | } 565 | 566 | /* Wait for the first loss before sampling, to let the policer exhaust 567 | * its tokens and estimate the steady-state rate allowed by the policer. 568 | * Starting samples earlier includes bursts that over-estimate the bw. 569 | */ 570 | if (!bbr->lt_is_sampling) { 571 | if (!rs->losses) 572 | return; 573 | bbr_reset_lt_bw_sampling_interval(sk); 574 | bbr->lt_is_sampling = true; 575 | } 576 | 577 | /* To avoid underestimates, reset sampling if we run out of data. */ 578 | if (rs->is_app_limited) { 579 | bbr_reset_lt_bw_sampling(sk); 580 | return; 581 | } 582 | 583 | if (bbr->round_start) 584 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 585 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 586 | return; /* sampling interval needs to be longer */ 587 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 588 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 589 | return; 590 | } 591 | 592 | /* End sampling interval when a packet is lost, so we estimate the 593 | * policer tokens were exhausted. Stopping the sampling before the 594 | * tokens are exhausted under-estimates the policed rate. 595 | */ 596 | if (!rs->losses) 597 | return; 598 | 599 | /* Calculate packets lost and delivered in sampling interval. 
*/ 600 | lost = tp->lost - bbr->lt_last_lost; 601 | delivered = tp->delivered - bbr->lt_last_delivered; 602 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 603 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 604 | return; 605 | 606 | /* Find average delivery rate in this sampling interval. */ 607 | t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); 608 | if (t < 1) 609 | return; /* interval is less than one jiffy, so wait */ 610 | t = jiffies_to_usecs(t); 611 | /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ 612 | if (t < 1) { 613 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 614 | return; 615 | } 616 | bw = (u64)delivered * BW_UNIT; 617 | do_div(bw, t); 618 | bbr_lt_bw_interval_done(sk, bw); 619 | } 620 | 621 | /* Estimate the bandwidth based on how fast packets are delivered */ 622 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 623 | { 624 | struct tcp_sock *tp = tcp_sk(sk); 625 | struct bbr *bbr = inet_csk_ca(sk); 626 | u64 bw; 627 | 628 | bbr->round_start = 0; 629 | if (rs->delivered < 0 || rs->interval_us <= 0) 630 | return; /* Not a valid observation */ 631 | 632 | /* See if we've reached the next RTT */ 633 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 634 | bbr->next_rtt_delivered = tp->delivered; 635 | bbr->rtt_cnt++; 636 | bbr->round_start = 1; 637 | bbr->packet_conservation = 0; 638 | } 639 | 640 | bbr_lt_bw_sampling(sk, rs); 641 | 642 | /* Divide delivered by the interval to find a (lower bound) bottleneck 643 | * bandwidth sample. Delivered is in packets and interval_us in uS and 644 | * ratio will be <<1 for most connections. So delivered is first scaled. 645 | */ 646 | bw = (u64)rs->delivered * BW_UNIT; 647 | do_div(bw, rs->interval_us); 648 | 649 | /* If this sample is application-limited, it is likely to have a very 650 | * low delivered count that represents application behavior rather than 651 | * the available network rate. Such a sample could drag down estimated 652 | * bw, causing needless slow-down. Thus, to continue to send at the 653 | * last measured network rate, we filter out app-limited samples unless 654 | * they describe the path bw at least as well as our bw model. 655 | * 656 | * So the goal during app-limited phase is to proceed with the best 657 | * network rate no matter how long. We automatically leave this 658 | * phase when app writes faster than the network can deliver :) 659 | */ 660 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 661 | /* Incorporate new sample into our max bw filter. */ 662 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 663 | } 664 | } 665 | 666 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 667 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 668 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 669 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 670 | * higher rwin, 3: we get higher delivery rate samples. Or transient 671 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 672 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
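 *
 * Illustrative example (hypothetical numbers, not from the source): suppose
 * the filtered max bw is 800 pkts/ms at the start of a round. The growth
 * threshold is 800 * 5/4 = 1000 pkts/ms. If the next three non-app-limited
 * rounds all measure a max bw below 1000 pkts/ms, full_bw_cnt reaches 3 and
 * bbr_full_bw_reached() returns true, ending STARTUP; a single round at or
 * above 1000 pkts/ms would instead reset full_bw to the new measurement and
 * restart the count.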
673 | */ 674 | static void bbr_check_full_bw_reached(struct sock *sk, 675 | const struct rate_sample *rs) 676 | { 677 | struct bbr *bbr = inet_csk_ca(sk); 678 | u32 bw_thresh; 679 | 680 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 681 | return; 682 | 683 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 684 | if (bbr_max_bw(sk) >= bw_thresh) { 685 | bbr->full_bw = bbr_max_bw(sk); 686 | bbr->full_bw_cnt = 0; 687 | return; 688 | } 689 | ++bbr->full_bw_cnt; 690 | } 691 | 692 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 693 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 694 | { 695 | struct bbr *bbr = inet_csk_ca(sk); 696 | 697 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 698 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 699 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 700 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 701 | } /* fall through to check if in-flight is already small: */ 702 | if (bbr->mode == BBR_DRAIN && 703 | tcp_packets_in_flight(tcp_sk(sk)) <= 704 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 705 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 706 | } 707 | 708 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 709 | * periodically drain the bottleneck queue, to converge to measure the true 710 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 711 | * small (reducing queuing delay and packet loss) and achieve fairness among 712 | * BBR flows. 713 | * 714 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 715 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 716 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 717 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 718 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 719 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 720 | * 721 | * Note that flows need only pay 2% if they are busy sending over the last 10 722 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 723 | * natural silences or low-rate periods within 10 seconds where the rate is low 724 | * enough for long enough to drain its queue in the bottleneck. We pick up 725 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 726 | */ 727 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 728 | { 729 | struct tcp_sock *tp = tcp_sk(sk); 730 | struct bbr *bbr = inet_csk_ca(sk); 731 | bool filter_expired; 732 | 733 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 734 | filter_expired = after(tcp_time_stamp, 735 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 736 | if (rs->rtt_us >= 0 && 737 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 738 | bbr->min_rtt_us = rs->rtt_us; 739 | bbr->min_rtt_stamp = tcp_time_stamp; 740 | } 741 | 742 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 743 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 744 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 745 | bbr->pacing_gain = BBR_UNIT; 746 | bbr->cwnd_gain = BBR_UNIT; 747 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 748 | bbr->probe_rtt_done_stamp = 0; 749 | } 750 | 751 | if (bbr->mode == BBR_PROBE_RTT) { 752 | /* Ignore low rate samples during this mode. 
*/ 753 | tp->app_limited = 754 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 755 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 756 | if (!bbr->probe_rtt_done_stamp && 757 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 758 | bbr->probe_rtt_done_stamp = tcp_time_stamp + 759 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 760 | bbr->probe_rtt_round_done = 0; 761 | bbr->next_rtt_delivered = tp->delivered; 762 | } else if (bbr->probe_rtt_done_stamp) { 763 | if (bbr->round_start) 764 | bbr->probe_rtt_round_done = 1; 765 | if (bbr->probe_rtt_round_done && 766 | after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { 767 | bbr->min_rtt_stamp = tcp_time_stamp; 768 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 769 | bbr_reset_mode(sk); 770 | } 771 | } 772 | } 773 | bbr->idle_restart = 0; 774 | } 775 | 776 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 777 | { 778 | bbr_update_bw(sk, rs); 779 | bbr_update_cycle_phase(sk, rs); 780 | bbr_check_full_bw_reached(sk, rs); 781 | bbr_check_drain(sk, rs); 782 | bbr_update_min_rtt(sk, rs); 783 | } 784 | 785 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 786 | { 787 | struct bbr *bbr = inet_csk_ca(sk); 788 | u32 bw; 789 | 790 | bbr_update_model(sk, rs); 791 | 792 | bw = bbr_bw(sk); 793 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 794 | bbr_set_tso_segs_goal(sk); 795 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 796 | } 797 | 798 | static void bbr_init(struct sock *sk) 799 | { 800 | struct tcp_sock *tp = tcp_sk(sk); 801 | struct bbr *bbr = inet_csk_ca(sk); 802 | u64 bw; 803 | 804 | bbr->prior_cwnd = 0; 805 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 806 | bbr->rtt_cnt = 0; 807 | bbr->next_rtt_delivered = 0; 808 | bbr->prev_ca_state = TCP_CA_Open; 809 | bbr->packet_conservation = 0; 810 | 811 | bbr->probe_rtt_done_stamp = 0; 812 | bbr->probe_rtt_round_done = 0; 813 | bbr->min_rtt_us = tcp_min_rtt(tp); 814 | bbr->min_rtt_stamp = tcp_time_stamp; 815 | 816 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 817 | 818 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 819 | bw = (u64)tp->snd_cwnd * BW_UNIT; 820 | do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); 821 | sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ 822 | bbr_set_pacing_rate(sk, bw, bbr_high_gain); 823 | 824 | bbr->restore_cwnd = 0; 825 | bbr->round_start = 0; 826 | bbr->idle_restart = 0; 827 | bbr->full_bw = 0; 828 | bbr->full_bw_cnt = 0; 829 | bbr->cycle_mstamp.v64 = 0; 830 | bbr->cycle_idx = 0; 831 | bbr_reset_lt_bw_sampling(sk); 832 | bbr_reset_startup_mode(sk); 833 | } 834 | 835 | static u32 bbr_sndbuf_expand(struct sock *sk) 836 | { 837 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 838 | return 3; 839 | } 840 | 841 | /* In theory BBR does not need to undo the cwnd since it does not 842 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 843 | */ 844 | static u32 bbr_undo_cwnd(struct sock *sk) 845 | { 846 | return tcp_sk(sk)->snd_cwnd; 847 | } 848 | 849 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ 850 | static u32 bbr_ssthresh(struct sock *sk) 851 | { 852 | bbr_save_cwnd(sk); 853 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 854 | } 855 | 856 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 857 | union tcp_cc_info *info) 858 | { 859 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 860 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 861 | struct tcp_sock *tp = tcp_sk(sk); 862 | struct bbr *bbr = inet_csk_ca(sk); 863 | u64 bw = bbr_bw(sk); 864 | 865 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 866 | memset(&info->bbr, 0, sizeof(info->bbr)); 867 | info->bbr.bbr_bw_lo = (u32)bw; 868 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 869 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 870 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 871 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 872 | *attr = INET_DIAG_BBRINFO; 873 | return sizeof(info->bbr); 874 | } 875 | return 0; 876 | } 877 | 878 | static void bbr_set_state(struct sock *sk, u8 new_state) 879 | { 880 | struct bbr *bbr = inet_csk_ca(sk); 881 | 882 | if (new_state == TCP_CA_Loss) { 883 | struct rate_sample rs = { .losses = 1 }; 884 | 885 | bbr->prev_ca_state = TCP_CA_Loss; 886 | bbr->full_bw = 0; 887 | bbr->round_start = 1; /* treat RTO like end of a round */ 888 | bbr_lt_bw_sampling(sk, &rs); 889 | } 890 | } 891 | 892 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 893 | .flags = TCP_CONG_NON_RESTRICTED, 894 | .name = "bbr", 895 | .owner = THIS_MODULE, 896 | .init = bbr_init, 897 | .cong_control = bbr_main, 898 | .sndbuf_expand = bbr_sndbuf_expand, 899 | .undo_cwnd = bbr_undo_cwnd, 900 | .cwnd_event = bbr_cwnd_event, 901 | .ssthresh = bbr_ssthresh, 902 | .tso_segs_goal = bbr_tso_segs_goal, 903 | .get_info = bbr_get_info, 904 | .set_state = bbr_set_state, 905 | }; 906 | 907 | static int __init bbr_register(void) 908 | { 909 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 910 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 911 | } 912 | 913 | static void __exit bbr_unregister(void) 914 | { 915 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 916 | } 917 | 918 | module_init(bbr_register); 919 | module_exit(bbr_unregister); 920 | 921 | MODULE_AUTHOR("Van Jacobson "); 922 | MODULE_AUTHOR("Neal Cardwell "); 923 | MODULE_AUTHOR("Yuchung Cheng "); 924 | MODULE_AUTHOR("Soheil Hassas Yeganeh "); 925 | MODULE_LICENSE("Dual BSD/GPL"); 926 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 927 | -------------------------------------------------------------------------------- /General/Debian/mod/tcp_nanqinlang-for-v4.10.2.ko: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tcp-nanqinlang/general/0b92fc4efc3a5cc1a2e486b86c6123353f4d5bfe/General/Debian/mod/tcp_nanqinlang-for-v4.10.2.ko -------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.12andbelow/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. 
In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable. 16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * since pacing is integral to the BBR design and implementation. 57 | * BBR without pacing would not function properly, and may incur unnecessary 58 | * high packet loss rates. 59 | */ 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | 67 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 68 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 69 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 70 | * Since the minimum window is >=4 packets, the lower bound isn't 71 | * an issue. The upper bound isn't an issue with existing technologies. 72 | */ 73 | #define BW_SCALE 24 74 | #define BW_UNIT (1 << BW_SCALE) 75 | 76 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ 77 | #define BBR_UNIT (1 << BBR_SCALE) 78 | 79 | /* BBR has the following modes for deciding how fast to send: */ 80 | enum bbr_mode { 81 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 82 | BBR_DRAIN, /* drain any queue created during startup */ 83 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 84 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 85 | }; 86 | 87 | /* BBR congestion control block */ 88 | struct bbr { 89 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 90 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 91 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 92 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 93 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 94 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 95 | struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ 96 | u32 mode:3, /* current bbr_mode in state machine */ 97 | prev_ca_state:3, /* CA state on previous ACK */ 98 | packet_conservation:1, /* use packet conservation? */ 99 | restore_cwnd:1, /* decided to revert cwnd to old value */ 100 | round_start:1, /* start of packet-timed tx->ack round? */ 101 | tso_segs_goal:7, /* segments we want in each skb we send */ 102 | idle_restart:1, /* restarting after idle? */ 103 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 104 | unused:5, 105 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 106 | lt_rtt_cnt:7, /* round trips in long-term interval */ 107 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 108 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 109 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 110 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 111 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 112 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 113 | cwnd_gain:10, /* current gain for setting cwnd */ 114 | full_bw_cnt:3, /* number of rounds without large bw gains */ 115 | cycle_idx:3, /* current index in pacing_gain cycle array */ 116 | unused_b:6; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | 
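
A quick aside on the fixed-point gain arithmetic defined just above: gains are stored in Q8 form (BBR_SCALE = 8, so BBR_UNIT = 256 represents 1.0), and applying a gain is a multiply followed by a right shift. The stand-alone user-space sketch below is not part of this file or of the module; the helper name apply_gain and the sample value are made up purely to illustrate how the high/drain/cwnd gain constants behave when applied. The `+ 1` on bbr_high_gain rounds the truncated integer up so the stored value stays at or above 2/ln(2).

#include <stdio.h>
#include <stdint.h>

#define BBR_SCALE 8                  /* same Q8 scale as the module */
#define BBR_UNIT  (1 << BBR_SCALE)

/* Apply a Q8 gain the way the module does: multiply, then shift the
 * scale factor back out. */
static uint64_t apply_gain(uint64_t val, int gain)
{
        return (val * gain) >> BBR_SCALE;
}

int main(void)
{
        int high_gain  = BBR_UNIT * 2885 / 1000 + 1; /* 739/256 ~= 2.89 ~= 2/ln(2) */
        int drain_gain = BBR_UNIT * 1000 / 2885;     /* 88/256  ~= 0.34 ~= ln(2)/2 */
        int cwnd_gain  = BBR_UNIT * 2;               /* 512/256  = 2.0             */
        uint64_t bw = 1000;                          /* arbitrary example value    */

        printf("high_gain  = %d (%.3fx) -> %llu\n", high_gain,
               high_gain / 256.0, (unsigned long long)apply_gain(bw, high_gain));
        printf("drain_gain = %d (%.3fx) -> %llu\n", drain_gain,
               drain_gain / 256.0, (unsigned long long)apply_gain(bw, drain_gain));
        printf("cwnd_gain  = %d (%.3fx) -> %llu\n", cwnd_gain,
               cwnd_gain / 256.0, (unsigned long long)apply_gain(bw, cwnd_gain));
        return 0;
}
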
/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... */ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? */ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Pace using current bw estimate and a gain factor. In order to help drive the 216 | * network toward lower queues while maintaining high utilization and low 217 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 218 | * estimated bandwidth. This is an important aspect of the design. 
In this 219 | * implementation this slightly lower pacing rate is achieved implicitly by not 220 | * including link-layer headers in the packet size used for the pacing rate. 221 | */ 222 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 223 | { 224 | struct bbr *bbr = inet_csk_ca(sk); 225 | u64 rate = bw; 226 | 227 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 228 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 229 | if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) 230 | sk->sk_pacing_rate = rate; 231 | } 232 | 233 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 234 | static u32 bbr_tso_segs_goal(struct sock *sk) 235 | { 236 | struct bbr *bbr = inet_csk_ca(sk); 237 | 238 | return bbr->tso_segs_goal; 239 | } 240 | 241 | static void bbr_set_tso_segs_goal(struct sock *sk) 242 | { 243 | struct tcp_sock *tp = tcp_sk(sk); 244 | struct bbr *bbr = inet_csk_ca(sk); 245 | u32 min_segs; 246 | 247 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; 248 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 249 | 0x7FU); 250 | } 251 | 252 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 253 | static void bbr_save_cwnd(struct sock *sk) 254 | { 255 | struct tcp_sock *tp = tcp_sk(sk); 256 | struct bbr *bbr = inet_csk_ca(sk); 257 | 258 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 259 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 260 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 261 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 262 | } 263 | 264 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 265 | { 266 | struct tcp_sock *tp = tcp_sk(sk); 267 | struct bbr *bbr = inet_csk_ca(sk); 268 | 269 | if (event == CA_EVENT_TX_START && tp->app_limited) { 270 | bbr->idle_restart = 1; 271 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 272 | * need more speed (we're restarting from idle and app-limited). 273 | */ 274 | if (bbr->mode == BBR_PROBE_BW) 275 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 276 | } 277 | } 278 | 279 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 280 | * estimated bottleneck bandwidth: 281 | * 282 | * cwnd = bw * min_rtt * gain = BDP * gain 283 | * 284 | * The key factor, gain, controls the amount of queue. While a small gain 285 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 286 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 287 | * noise may cause BBR to under-estimate the rate. 288 | * 289 | * To achieve full performance in high-speed paths, we budget enough cwnd to 290 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 291 | * - one skb in sending host Qdisc, 292 | * - one skb in sending host TSO/GSO engine 293 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 294 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 295 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 296 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 297 | * full even with ACK-every-other-packet delayed ACKs. 298 | */ 299 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 300 | { 301 | struct bbr *bbr = inet_csk_ca(sk); 302 | u32 cwnd; 303 | u64 w; 304 | 305 | /* If we've never had a valid RTT sample, cap cwnd at the initial 306 | * default. 
This should only happen when the connection is not using TCP 307 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 308 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 309 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 310 | */ 311 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 312 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 313 | 314 | w = (u64)bw * bbr->min_rtt_us; 315 | 316 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 317 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 318 | 319 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 320 | cwnd += 3 * bbr->tso_segs_goal; 321 | 322 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ 323 | cwnd = (cwnd + 1) & ~1U; 324 | 325 | return cwnd; 326 | } 327 | 328 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 329 | * follow the packet conservation principle: send P packets per P packets acked. 330 | * After that, we slow-start and send at most 2*P packets per P packets acked. 331 | * After recovery finishes, or upon undo, we restore the cwnd we had when 332 | * recovery started (capped by the target cwnd based on estimated BDP). 333 | * 334 | * TODO(ycheng/ncardwell): implement a rate-based approach. 335 | */ 336 | static bool bbr_set_cwnd_to_recover_or_restore( 337 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 338 | { 339 | struct tcp_sock *tp = tcp_sk(sk); 340 | struct bbr *bbr = inet_csk_ca(sk); 341 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 342 | u32 cwnd = tp->snd_cwnd; 343 | 344 | /* An ACK for P pkts should release at most 2*P packets. We do this 345 | * in two steps. First, here we deduct the number of lost packets. 346 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 347 | */ 348 | if (rs->losses > 0) 349 | cwnd = max_t(s32, cwnd - rs->losses, 1); 350 | 351 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 352 | /* Starting 1st round of Recovery, so do packet conservation. */ 353 | bbr->packet_conservation = 1; 354 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 355 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 356 | cwnd = tcp_packets_in_flight(tp) + acked; 357 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 358 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 359 | bbr->restore_cwnd = 1; 360 | bbr->packet_conservation = 0; 361 | } 362 | bbr->prev_ca_state = state; 363 | 364 | if (bbr->restore_cwnd) { 365 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 366 | cwnd = max(cwnd, bbr->prior_cwnd); 367 | bbr->restore_cwnd = 0; 368 | } 369 | 370 | if (bbr->packet_conservation) { 371 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 372 | return true; /* yes, using packet conservation */ 373 | } 374 | *new_cwnd = cwnd; 375 | return false; 376 | } 377 | 378 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 379 | * has drawn us down below target), or snap down to target if we're above it. 
380 | */ 381 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 382 | u32 acked, u32 bw, int gain) 383 | { 384 | struct tcp_sock *tp = tcp_sk(sk); 385 | struct bbr *bbr = inet_csk_ca(sk); 386 | u32 cwnd = 0, target_cwnd = 0; 387 | 388 | if (!acked) 389 | return; 390 | 391 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 392 | goto done; 393 | 394 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 395 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 396 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 397 | cwnd = min(cwnd + acked, target_cwnd); 398 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 399 | cwnd = cwnd + acked; 400 | cwnd = max(cwnd, bbr_cwnd_min_target); 401 | 402 | done: 403 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 404 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 405 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 406 | } 407 | 408 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. */ 409 | static bool bbr_is_next_cycle_phase(struct sock *sk, 410 | const struct rate_sample *rs) 411 | { 412 | struct tcp_sock *tp = tcp_sk(sk); 413 | struct bbr *bbr = inet_csk_ca(sk); 414 | bool is_full_length = 415 | skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > 416 | bbr->min_rtt_us; 417 | u32 inflight, bw; 418 | 419 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 420 | * use the pipe without increasing the queue. 421 | */ 422 | if (bbr->pacing_gain == BBR_UNIT) 423 | return is_full_length; /* just use wall clock time */ 424 | 425 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 426 | bw = bbr_max_bw(sk); 427 | 428 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 429 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 430 | * small (e.g. on a LAN). We do not persist if packets are lost, since 431 | * a path with small buffers may not hold that much. 432 | */ 433 | if (bbr->pacing_gain > BBR_UNIT) 434 | return is_full_length && 435 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 436 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 437 | 438 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 439 | * probing didn't find more bw. If inflight falls to match BDP then we 440 | * estimate queue is drained; persisting would underutilize the pipe. 441 | */ 442 | return is_full_length || 443 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 444 | } 445 | 446 | static void bbr_advance_cycle_phase(struct sock *sk) 447 | { 448 | struct tcp_sock *tp = tcp_sk(sk); 449 | struct bbr *bbr = inet_csk_ca(sk); 450 | 451 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 452 | bbr->cycle_mstamp = tp->delivered_mstamp; 453 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 454 | } 455 | 456 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. 
*/ 457 | static void bbr_update_cycle_phase(struct sock *sk, 458 | const struct rate_sample *rs) 459 | { 460 | struct bbr *bbr = inet_csk_ca(sk); 461 | 462 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 463 | bbr_is_next_cycle_phase(sk, rs)) 464 | bbr_advance_cycle_phase(sk); 465 | } 466 | 467 | static void bbr_reset_startup_mode(struct sock *sk) 468 | { 469 | struct bbr *bbr = inet_csk_ca(sk); 470 | 471 | bbr->mode = BBR_STARTUP; 472 | bbr->pacing_gain = bbr_high_gain; 473 | bbr->cwnd_gain = bbr_high_gain; 474 | } 475 | 476 | static void bbr_reset_probe_bw_mode(struct sock *sk) 477 | { 478 | struct bbr *bbr = inet_csk_ca(sk); 479 | 480 | bbr->mode = BBR_PROBE_BW; 481 | bbr->pacing_gain = BBR_UNIT; 482 | bbr->cwnd_gain = bbr_cwnd_gain; 483 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 484 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 485 | } 486 | 487 | static void bbr_reset_mode(struct sock *sk) 488 | { 489 | if (!bbr_full_bw_reached(sk)) 490 | bbr_reset_startup_mode(sk); 491 | else 492 | bbr_reset_probe_bw_mode(sk); 493 | } 494 | 495 | /* Start a new long-term sampling interval. */ 496 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 497 | { 498 | struct tcp_sock *tp = tcp_sk(sk); 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; 502 | bbr->lt_last_delivered = tp->delivered; 503 | bbr->lt_last_lost = tp->lost; 504 | bbr->lt_rtt_cnt = 0; 505 | } 506 | 507 | /* Completely reset long-term bandwidth sampling. */ 508 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 509 | { 510 | struct bbr *bbr = inet_csk_ca(sk); 511 | 512 | bbr->lt_bw = 0; 513 | bbr->lt_use_bw = 0; 514 | bbr->lt_is_sampling = false; 515 | bbr_reset_lt_bw_sampling_interval(sk); 516 | } 517 | 518 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 519 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 520 | { 521 | struct bbr *bbr = inet_csk_ca(sk); 522 | u32 diff; 523 | 524 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 525 | /* Is new bw close to the lt_bw from the previous interval? */ 526 | diff = abs(bw - bbr->lt_bw); 527 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 528 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 529 | bbr_lt_bw_diff)) { 530 | /* All criteria are met; estimate we're policed. */ 531 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 532 | bbr->lt_use_bw = 1; 533 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 534 | bbr->lt_rtt_cnt = 0; 535 | return; 536 | } 537 | } 538 | bbr->lt_bw = bw; 539 | bbr_reset_lt_bw_sampling_interval(sk); 540 | } 541 | 542 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 543 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 544 | * explicitly models their policed rate, to reduce unnecessary losses. We 545 | * estimate that we're policed if we see 2 consecutive sampling intervals with 546 | * consistent throughput and high packet loss. If we think we're being policed, 547 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 548 | */ 549 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 550 | { 551 | struct tcp_sock *tp = tcp_sk(sk); 552 | struct bbr *bbr = inet_csk_ca(sk); 553 | u32 lost, delivered; 554 | u64 bw; 555 | s32 t; 556 | 557 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? 
*/ 558 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 559 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 560 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 561 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 562 | } 563 | return; 564 | } 565 | 566 | /* Wait for the first loss before sampling, to let the policer exhaust 567 | * its tokens and estimate the steady-state rate allowed by the policer. 568 | * Starting samples earlier includes bursts that over-estimate the bw. 569 | */ 570 | if (!bbr->lt_is_sampling) { 571 | if (!rs->losses) 572 | return; 573 | bbr_reset_lt_bw_sampling_interval(sk); 574 | bbr->lt_is_sampling = true; 575 | } 576 | 577 | /* To avoid underestimates, reset sampling if we run out of data. */ 578 | if (rs->is_app_limited) { 579 | bbr_reset_lt_bw_sampling(sk); 580 | return; 581 | } 582 | 583 | if (bbr->round_start) 584 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 585 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 586 | return; /* sampling interval needs to be longer */ 587 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 588 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 589 | return; 590 | } 591 | 592 | /* End sampling interval when a packet is lost, so we estimate the 593 | * policer tokens were exhausted. Stopping the sampling before the 594 | * tokens are exhausted under-estimates the policed rate. 595 | */ 596 | if (!rs->losses) 597 | return; 598 | 599 | /* Calculate packets lost and delivered in sampling interval. */ 600 | lost = tp->lost - bbr->lt_last_lost; 601 | delivered = tp->delivered - bbr->lt_last_delivered; 602 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 603 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 604 | return; 605 | 606 | /* Find average delivery rate in this sampling interval. */ 607 | t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); 608 | if (t < 1) 609 | return; /* interval is less than one jiffy, so wait */ 610 | t = jiffies_to_usecs(t); 611 | /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ 612 | if (t < 1) { 613 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 614 | return; 615 | } 616 | bw = (u64)delivered * BW_UNIT; 617 | do_div(bw, t); 618 | bbr_lt_bw_interval_done(sk, bw); 619 | } 620 | 621 | /* Estimate the bandwidth based on how fast packets are delivered */ 622 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 623 | { 624 | struct tcp_sock *tp = tcp_sk(sk); 625 | struct bbr *bbr = inet_csk_ca(sk); 626 | u64 bw; 627 | 628 | bbr->round_start = 0; 629 | if (rs->delivered < 0 || rs->interval_us <= 0) 630 | return; /* Not a valid observation */ 631 | 632 | /* See if we've reached the next RTT */ 633 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 634 | bbr->next_rtt_delivered = tp->delivered; 635 | bbr->rtt_cnt++; 636 | bbr->round_start = 1; 637 | bbr->packet_conservation = 0; 638 | } 639 | 640 | bbr_lt_bw_sampling(sk, rs); 641 | 642 | /* Divide delivered by the interval to find a (lower bound) bottleneck 643 | * bandwidth sample. Delivered is in packets and interval_us in uS and 644 | * ratio will be <<1 for most connections. So delivered is first scaled. 645 | */ 646 | bw = (u64)rs->delivered * BW_UNIT; 647 | do_div(bw, rs->interval_us); 648 | 649 | /* If this sample is application-limited, it is likely to have a very 650 | * low delivered count that represents application behavior rather than 651 | * the available network rate. 
Such a sample could drag down estimated 652 | * bw, causing needless slow-down. Thus, to continue to send at the 653 | * last measured network rate, we filter out app-limited samples unless 654 | * they describe the path bw at least as well as our bw model. 655 | * 656 | * So the goal during app-limited phase is to proceed with the best 657 | * network rate no matter how long. We automatically leave this 658 | * phase when app writes faster than the network can deliver :) 659 | */ 660 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 661 | /* Incorporate new sample into our max bw filter. */ 662 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 663 | } 664 | } 665 | 666 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 667 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 668 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 669 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 670 | * higher rwin, 3: we get higher delivery rate samples. Or transient 671 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 672 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 673 | */ 674 | static void bbr_check_full_bw_reached(struct sock *sk, 675 | const struct rate_sample *rs) 676 | { 677 | struct bbr *bbr = inet_csk_ca(sk); 678 | u32 bw_thresh; 679 | 680 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 681 | return; 682 | 683 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 684 | if (bbr_max_bw(sk) >= bw_thresh) { 685 | bbr->full_bw = bbr_max_bw(sk); 686 | bbr->full_bw_cnt = 0; 687 | return; 688 | } 689 | ++bbr->full_bw_cnt; 690 | } 691 | 692 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 693 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 694 | { 695 | struct bbr *bbr = inet_csk_ca(sk); 696 | 697 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 698 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 699 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 700 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 701 | } /* fall through to check if in-flight is already small: */ 702 | if (bbr->mode == BBR_DRAIN && 703 | tcp_packets_in_flight(tcp_sk(sk)) <= 704 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 705 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 706 | } 707 | 708 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 709 | * periodically drain the bottleneck queue, to converge to measure the true 710 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 711 | * small (reducing queuing delay and packet loss) and achieve fairness among 712 | * BBR flows. 713 | * 714 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 715 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 716 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 717 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 718 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 719 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 720 | * 721 | * Note that flows need only pay 2% if they are busy sending over the last 10 722 | * seconds. 
Interactive applications (e.g., Web, RPCs, video chunks) often have 723 | * natural silences or low-rate periods within 10 seconds where the rate is low 724 | * enough for long enough to drain its queue in the bottleneck. We pick up 725 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 726 | */ 727 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 728 | { 729 | struct tcp_sock *tp = tcp_sk(sk); 730 | struct bbr *bbr = inet_csk_ca(sk); 731 | bool filter_expired; 732 | 733 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 734 | filter_expired = after(tcp_time_stamp, 735 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 736 | if (rs->rtt_us >= 0 && 737 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 738 | bbr->min_rtt_us = rs->rtt_us; 739 | bbr->min_rtt_stamp = tcp_time_stamp; 740 | } 741 | 742 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 743 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 744 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 745 | bbr->pacing_gain = BBR_UNIT; 746 | bbr->cwnd_gain = BBR_UNIT; 747 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 748 | bbr->probe_rtt_done_stamp = 0; 749 | } 750 | 751 | if (bbr->mode == BBR_PROBE_RTT) { 752 | /* Ignore low rate samples during this mode. */ 753 | tp->app_limited = 754 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 755 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 756 | if (!bbr->probe_rtt_done_stamp && 757 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 758 | bbr->probe_rtt_done_stamp = tcp_time_stamp + 759 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 760 | bbr->probe_rtt_round_done = 0; 761 | bbr->next_rtt_delivered = tp->delivered; 762 | } else if (bbr->probe_rtt_done_stamp) { 763 | if (bbr->round_start) 764 | bbr->probe_rtt_round_done = 1; 765 | if (bbr->probe_rtt_round_done && 766 | after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { 767 | bbr->min_rtt_stamp = tcp_time_stamp; 768 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 769 | bbr_reset_mode(sk); 770 | } 771 | } 772 | } 773 | bbr->idle_restart = 0; 774 | } 775 | 776 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 777 | { 778 | bbr_update_bw(sk, rs); 779 | bbr_update_cycle_phase(sk, rs); 780 | bbr_check_full_bw_reached(sk, rs); 781 | bbr_check_drain(sk, rs); 782 | bbr_update_min_rtt(sk, rs); 783 | } 784 | 785 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 786 | { 787 | struct bbr *bbr = inet_csk_ca(sk); 788 | u32 bw; 789 | 790 | bbr_update_model(sk, rs); 791 | 792 | bw = bbr_bw(sk); 793 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 794 | bbr_set_tso_segs_goal(sk); 795 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 796 | } 797 | 798 | static void bbr_init(struct sock *sk) 799 | { 800 | struct tcp_sock *tp = tcp_sk(sk); 801 | struct bbr *bbr = inet_csk_ca(sk); 802 | u64 bw; 803 | 804 | bbr->prior_cwnd = 0; 805 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 806 | bbr->rtt_cnt = 0; 807 | bbr->next_rtt_delivered = 0; 808 | bbr->prev_ca_state = TCP_CA_Open; 809 | bbr->packet_conservation = 0; 810 | 811 | bbr->probe_rtt_done_stamp = 0; 812 | bbr->probe_rtt_round_done = 0; 813 | bbr->min_rtt_us = tcp_min_rtt(tp); 814 | bbr->min_rtt_stamp = tcp_time_stamp; 815 | 816 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 817 | 818 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. 
*/ 819 | bw = (u64)tp->snd_cwnd * BW_UNIT; 820 | do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); 821 | sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ 822 | bbr_set_pacing_rate(sk, bw, bbr_high_gain); 823 | 824 | bbr->restore_cwnd = 0; 825 | bbr->round_start = 0; 826 | bbr->idle_restart = 0; 827 | bbr->full_bw = 0; 828 | bbr->full_bw_cnt = 0; 829 | bbr->cycle_mstamp.v64 = 0; 830 | bbr->cycle_idx = 0; 831 | bbr_reset_lt_bw_sampling(sk); 832 | bbr_reset_startup_mode(sk); 833 | } 834 | 835 | static u32 bbr_sndbuf_expand(struct sock *sk) 836 | { 837 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 838 | return 3; 839 | } 840 | 841 | /* In theory BBR does not need to undo the cwnd since it does not 842 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 843 | */ 844 | static u32 bbr_undo_cwnd(struct sock *sk) 845 | { 846 | return tcp_sk(sk)->snd_cwnd; 847 | } 848 | 849 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ 850 | static u32 bbr_ssthresh(struct sock *sk) 851 | { 852 | bbr_save_cwnd(sk); 853 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 854 | } 855 | 856 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 857 | union tcp_cc_info *info) 858 | { 859 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 860 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 861 | struct tcp_sock *tp = tcp_sk(sk); 862 | struct bbr *bbr = inet_csk_ca(sk); 863 | u64 bw = bbr_bw(sk); 864 | 865 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 866 | memset(&info->bbr, 0, sizeof(info->bbr)); 867 | info->bbr.bbr_bw_lo = (u32)bw; 868 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 869 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 870 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 871 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 872 | *attr = INET_DIAG_BBRINFO; 873 | return sizeof(info->bbr); 874 | } 875 | return 0; 876 | } 877 | 878 | static void bbr_set_state(struct sock *sk, u8 new_state) 879 | { 880 | struct bbr *bbr = inet_csk_ca(sk); 881 | 882 | if (new_state == TCP_CA_Loss) { 883 | struct rate_sample rs = { .losses = 1 }; 884 | 885 | bbr->prev_ca_state = TCP_CA_Loss; 886 | bbr->full_bw = 0; 887 | bbr->round_start = 1; /* treat RTO like end of a round */ 888 | bbr_lt_bw_sampling(sk, &rs); 889 | } 890 | } 891 | 892 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 893 | .flags = TCP_CONG_NON_RESTRICTED, 894 | .name = "bbr", 895 | .owner = THIS_MODULE, 896 | .init = bbr_init, 897 | .cong_control = bbr_main, 898 | .sndbuf_expand = bbr_sndbuf_expand, 899 | .undo_cwnd = bbr_undo_cwnd, 900 | .cwnd_event = bbr_cwnd_event, 901 | .ssthresh = bbr_ssthresh, 902 | .tso_segs_goal = bbr_tso_segs_goal, 903 | .get_info = bbr_get_info, 904 | .set_state = bbr_set_state, 905 | }; 906 | 907 | static int __init bbr_register(void) 908 | { 909 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 910 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 911 | } 912 | 913 | static void __exit bbr_unregister(void) 914 | { 915 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 916 | } 917 | 918 | module_init(bbr_register); 919 | module_exit(bbr_unregister); 920 | 921 | MODULE_AUTHOR("Van Jacobson "); 922 | MODULE_AUTHOR("Neal Cardwell "); 923 | MODULE_AUTHOR("Yuchung Cheng "); 924 | MODULE_AUTHOR("Soheil Hassas Yeganeh "); 925 | MODULE_LICENSE("Dual BSD/GPL"); 926 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 927 | 
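
To make the unit handling in the file above easier to follow, here is a stand-alone user-space sketch (not part of the repository) that mirrors the arithmetic of bbr_update_bw(), bbr_target_cwnd() and bbr_rate_bytes_per_sec() with made-up sample numbers. The variable names and the 100-packet / 10 ms / 1500-byte figures are assumptions for illustration only, and the tso_segs headroom and even-number rounding applied by the real bbr_target_cwnd() are left out for brevity.

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE  24
#define BW_UNIT   (1ULL << BW_SCALE)
#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)
#define USEC_PER_SEC 1000000ULL

int main(void)
{
        /* Made-up sample: 100 packets delivered over a 10 ms interval on a
         * path with a 10 ms min RTT and ~1500-byte packets. */
        uint64_t delivered   = 100;
        uint64_t interval_us = 10000;
        uint64_t min_rtt_us  = 10000;
        uint64_t mtu_bytes   = 1500;
        int gain = BBR_UNIT * 2;        /* bbr_cwnd_gain */

        /* Bandwidth sample in pkts/uS << BW_SCALE, as in bbr_update_bw(). */
        uint64_t bw = delivered * BW_UNIT / interval_us;

        /* Core of bbr_target_cwnd(): cwnd = BDP * gain, rounded up to packets. */
        uint64_t w = bw * min_rtt_us;
        uint64_t cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;

        /* Same multiply/shift order as bbr_rate_bytes_per_sec() (gain = 1.0),
         * chosen there to keep the intermediate products inside a u64. */
        uint64_t rate = bw;
        rate *= mtu_bytes;
        rate *= BBR_UNIT;
        rate >>= BBR_SCALE;
        rate *= USEC_PER_SEC;
        rate >>= BW_SCALE;

        printf("bw sample    = %llu (pkts/uS << 24)\n", (unsigned long long)bw);
        printf("target cwnd  = %llu packets (true BDP * gain = 200)\n",
               (unsigned long long)cwnd);
        printf("pacing rate  = %llu bytes/sec (~15 MB/s)\n",
               (unsigned long long)rate);
        return 0;
}

Keeping the rate in pkts/uS << 24 until the final conversion is what lets a u32 cover the bandwidth range quoted near the top of the file (roughly 715 bps up to a few Tbps) without losing precision at low rates.
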
-------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.13/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable. 16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * otherwise TCP stack falls back to an internal pacing using one high 57 | * resolution timer per TCP socket and may use more resources. 58 | */ 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | 66 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 67 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 68 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 69 | * Since the minimum window is >=4 packets, the lower bound isn't 70 | * an issue. The upper bound isn't an issue with existing technologies. 71 | */ 72 | #define BW_SCALE 24 73 | #define BW_UNIT (1 << BW_SCALE) 74 | 75 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ 76 | #define BBR_UNIT (1 << BBR_SCALE) 77 | 78 | /* BBR has the following modes for deciding how fast to send: */ 79 | enum bbr_mode { 80 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 | BBR_DRAIN, /* drain any queue created during startup */ 82 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 84 | }; 85 | 86 | /* BBR congestion control block */ 87 | struct bbr { 88 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 89 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 90 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 91 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 92 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 93 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 94 | u64 cycle_mstamp; /* time of this cycle phase start */ 95 | u32 mode:3, /* current bbr_mode in state machine */ 96 | prev_ca_state:3, /* CA state on previous ACK */ 97 | packet_conservation:1, /* use packet conservation? */ 98 | restore_cwnd:1, /* decided to revert cwnd to old value */ 99 | round_start:1, /* start of packet-timed tx->ack round? */ 100 | tso_segs_goal:7, /* segments we want in each skb we send */ 101 | idle_restart:1, /* restarting after idle? */ 102 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 103 | unused:5, 104 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 105 | lt_rtt_cnt:7, /* round trips in long-term interval */ 106 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 107 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 108 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 109 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 110 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 111 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 112 | cwnd_gain:10, /* current gain for setting cwnd */ 113 | full_bw_cnt:3, /* number of rounds without large bw gains */ 114 | cycle_idx:3, /* current index in pacing_gain cycle array */ 115 | has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ 116 | unused_b:5; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... */ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? 
*/ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 216 | static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 217 | { 218 | u64 rate = bw; 219 | 220 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 221 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 222 | return rate; 223 | } 224 | 225 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 226 | static void bbr_init_pacing_rate_from_rtt(struct sock *sk) 227 | { 228 | struct tcp_sock *tp = tcp_sk(sk); 229 | struct bbr *bbr = inet_csk_ca(sk); 230 | u64 bw; 231 | u32 rtt_us; 232 | 233 | if (tp->srtt_us) { /* any RTT sample yet? */ 234 | rtt_us = max(tp->srtt_us >> 3, 1U); 235 | bbr->has_seen_rtt = 1; 236 | } else { /* no RTT sample yet */ 237 | rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ 238 | } 239 | bw = (u64)tp->snd_cwnd * BW_UNIT; 240 | do_div(bw, rtt_us); 241 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); 242 | } 243 | 244 | /* Pace using current bw estimate and a gain factor. In order to help drive the 245 | * network toward lower queues while maintaining high utilization and low 246 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 247 | * estimated bandwidth. This is an important aspect of the design. In this 248 | * implementation this slightly lower pacing rate is achieved implicitly by not 249 | * including link-layer headers in the packet size used for the pacing rate. 250 | */ 251 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 252 | { 253 | struct tcp_sock *tp = tcp_sk(sk); 254 | struct bbr *bbr = inet_csk_ca(sk); 255 | u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 256 | 257 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 258 | bbr_init_pacing_rate_from_rtt(sk); 259 | if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) 260 | sk->sk_pacing_rate = rate; 261 | } 262 | 263 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 264 | static u32 bbr_tso_segs_goal(struct sock *sk) 265 | { 266 | struct bbr *bbr = inet_csk_ca(sk); 267 | 268 | return bbr->tso_segs_goal; 269 | } 270 | 271 | static void bbr_set_tso_segs_goal(struct sock *sk) 272 | { 273 | struct tcp_sock *tp = tcp_sk(sk); 274 | struct bbr *bbr = inet_csk_ca(sk); 275 | u32 min_segs; 276 | 277 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 278 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 279 | 0x7FU); 280 | } 281 | 282 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 283 | static void bbr_save_cwnd(struct sock *sk) 284 | { 285 | struct tcp_sock *tp = tcp_sk(sk); 286 | struct bbr *bbr = inet_csk_ca(sk); 287 | 288 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 289 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 290 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 291 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 292 | } 293 | 294 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 295 | { 296 | struct tcp_sock *tp = tcp_sk(sk); 297 | struct bbr *bbr = inet_csk_ca(sk); 298 | 299 | if (event == CA_EVENT_TX_START && tp->app_limited) { 300 | bbr->idle_restart = 1; 301 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 302 | * need more speed (we're restarting from idle and app-limited). 303 | */ 304 | if (bbr->mode == BBR_PROBE_BW) 305 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 306 | } 307 | } 308 | 309 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 310 | * estimated bottleneck bandwidth: 311 | * 312 | * cwnd = bw * min_rtt * gain = BDP * gain 313 | * 314 | * The key factor, gain, controls the amount of queue. While a small gain 315 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 316 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 317 | * noise may cause BBR to under-estimate the rate. 318 | * 319 | * To achieve full performance in high-speed paths, we budget enough cwnd to 320 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 321 | * - one skb in sending host Qdisc, 322 | * - one skb in sending host TSO/GSO engine 323 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 324 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 325 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 326 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 327 | * full even with ACK-every-other-packet delayed ACKs. 328 | */ 329 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 330 | { 331 | struct bbr *bbr = inet_csk_ca(sk); 332 | u32 cwnd; 333 | u64 w; 334 | 335 | /* If we've never had a valid RTT sample, cap cwnd at the initial 336 | * default. This should only happen when the connection is not using TCP 337 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 338 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 339 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 340 | */ 341 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 342 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 343 | 344 | w = (u64)bw * bbr->min_rtt_us; 345 | 346 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 347 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 348 | 349 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 350 | cwnd += 3 * bbr->tso_segs_goal; 351 | 352 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 353 | cwnd = (cwnd + 1) & ~1U; 354 | 355 | return cwnd; 356 | } 357 | 358 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 359 | * follow the packet conservation principle: send P packets per P packets acked. 360 | * After that, we slow-start and send at most 2*P packets per P packets acked. 361 | * After recovery finishes, or upon undo, we restore the cwnd we had when 362 | * recovery started (capped by the target cwnd based on estimated BDP). 363 | * 364 | * TODO(ycheng/ncardwell): implement a rate-based approach. 365 | */ 366 | static bool bbr_set_cwnd_to_recover_or_restore( 367 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 368 | { 369 | struct tcp_sock *tp = tcp_sk(sk); 370 | struct bbr *bbr = inet_csk_ca(sk); 371 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 372 | u32 cwnd = tp->snd_cwnd; 373 | 374 | /* An ACK for P pkts should release at most 2*P packets. We do this 375 | * in two steps. First, here we deduct the number of lost packets. 376 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 377 | */ 378 | if (rs->losses > 0) 379 | cwnd = max_t(s32, cwnd - rs->losses, 1); 380 | 381 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 382 | /* Starting 1st round of Recovery, so do packet conservation. */ 383 | bbr->packet_conservation = 1; 384 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 385 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 386 | cwnd = tcp_packets_in_flight(tp) + acked; 387 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 388 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 389 | bbr->restore_cwnd = 1; 390 | bbr->packet_conservation = 0; 391 | } 392 | bbr->prev_ca_state = state; 393 | 394 | if (bbr->restore_cwnd) { 395 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 396 | cwnd = max(cwnd, bbr->prior_cwnd); 397 | bbr->restore_cwnd = 0; 398 | } 399 | 400 | if (bbr->packet_conservation) { 401 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 402 | return true; /* yes, using packet conservation */ 403 | } 404 | *new_cwnd = cwnd; 405 | return false; 406 | } 407 | 408 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 409 | * has drawn us down below target), or snap down to target if we're above it. 410 | */ 411 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 412 | u32 acked, u32 bw, int gain) 413 | { 414 | struct tcp_sock *tp = tcp_sk(sk); 415 | struct bbr *bbr = inet_csk_ca(sk); 416 | u32 cwnd = 0, target_cwnd = 0; 417 | 418 | if (!acked) 419 | return; 420 | 421 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 422 | goto done; 423 | 424 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 425 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 426 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 427 | cwnd = min(cwnd + acked, target_cwnd); 428 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 429 | cwnd = cwnd + acked; 430 | cwnd = max(cwnd, bbr_cwnd_min_target); 431 | 432 | done: 433 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 434 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 435 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 436 | } 437 | 438 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 439 | static bool bbr_is_next_cycle_phase(struct sock *sk, 440 | const struct rate_sample *rs) 441 | { 442 | struct tcp_sock *tp = tcp_sk(sk); 443 | struct bbr *bbr = inet_csk_ca(sk); 444 | bool is_full_length = 445 | tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > 446 | bbr->min_rtt_us; 447 | u32 inflight, bw; 448 | 449 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 450 | * use the pipe without increasing the queue. 451 | */ 452 | if (bbr->pacing_gain == BBR_UNIT) 453 | return is_full_length; /* just use wall clock time */ 454 | 455 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 456 | bw = bbr_max_bw(sk); 457 | 458 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 459 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 460 | * small (e.g. on a LAN). We do not persist if packets are lost, since 461 | * a path with small buffers may not hold that much. 462 | */ 463 | if (bbr->pacing_gain > BBR_UNIT) 464 | return is_full_length && 465 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 466 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 467 | 468 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 469 | * probing didn't find more bw. If inflight falls to match BDP then we 470 | * estimate queue is drained; persisting would underutilize the pipe. 471 | */ 472 | return is_full_length || 473 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 474 | } 475 | 476 | static void bbr_advance_cycle_phase(struct sock *sk) 477 | { 478 | struct tcp_sock *tp = tcp_sk(sk); 479 | struct bbr *bbr = inet_csk_ca(sk); 480 | 481 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 482 | bbr->cycle_mstamp = tp->delivered_mstamp; 483 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 484 | } 485 | 486 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 487 | static void bbr_update_cycle_phase(struct sock *sk, 488 | const struct rate_sample *rs) 489 | { 490 | struct bbr *bbr = inet_csk_ca(sk); 491 | 492 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 493 | bbr_is_next_cycle_phase(sk, rs)) 494 | bbr_advance_cycle_phase(sk); 495 | } 496 | 497 | static void bbr_reset_startup_mode(struct sock *sk) 498 | { 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->mode = BBR_STARTUP; 502 | bbr->pacing_gain = bbr_high_gain; 503 | bbr->cwnd_gain = bbr_high_gain; 504 | } 505 | 506 | static void bbr_reset_probe_bw_mode(struct sock *sk) 507 | { 508 | struct bbr *bbr = inet_csk_ca(sk); 509 | 510 | bbr->mode = BBR_PROBE_BW; 511 | bbr->pacing_gain = BBR_UNIT; 512 | bbr->cwnd_gain = bbr_cwnd_gain; 513 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 514 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 515 | } 516 | 517 | static void bbr_reset_mode(struct sock *sk) 518 | { 519 | if (!bbr_full_bw_reached(sk)) 520 | bbr_reset_startup_mode(sk); 521 | else 522 | bbr_reset_probe_bw_mode(sk); 523 | } 524 | 525 | /* Start a new long-term sampling interval. */ 526 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 527 | { 528 | struct tcp_sock *tp = tcp_sk(sk); 529 | struct bbr *bbr = inet_csk_ca(sk); 530 | 531 | bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); 532 | bbr->lt_last_delivered = tp->delivered; 533 | bbr->lt_last_lost = tp->lost; 534 | bbr->lt_rtt_cnt = 0; 535 | } 536 | 537 | /* Completely reset long-term bandwidth sampling. 
*/ 538 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 539 | { 540 | struct bbr *bbr = inet_csk_ca(sk); 541 | 542 | bbr->lt_bw = 0; 543 | bbr->lt_use_bw = 0; 544 | bbr->lt_is_sampling = false; 545 | bbr_reset_lt_bw_sampling_interval(sk); 546 | } 547 | 548 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 549 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 550 | { 551 | struct bbr *bbr = inet_csk_ca(sk); 552 | u32 diff; 553 | 554 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 555 | /* Is new bw close to the lt_bw from the previous interval? */ 556 | diff = abs(bw - bbr->lt_bw); 557 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 558 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 559 | bbr_lt_bw_diff)) { 560 | /* All criteria are met; estimate we're policed. */ 561 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 562 | bbr->lt_use_bw = 1; 563 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 564 | bbr->lt_rtt_cnt = 0; 565 | return; 566 | } 567 | } 568 | bbr->lt_bw = bw; 569 | bbr_reset_lt_bw_sampling_interval(sk); 570 | } 571 | 572 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 573 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 574 | * explicitly models their policed rate, to reduce unnecessary losses. We 575 | * estimate that we're policed if we see 2 consecutive sampling intervals with 576 | * consistent throughput and high packet loss. If we think we're being policed, 577 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 578 | */ 579 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 580 | { 581 | struct tcp_sock *tp = tcp_sk(sk); 582 | struct bbr *bbr = inet_csk_ca(sk); 583 | u32 lost, delivered; 584 | u64 bw; 585 | u32 t; 586 | 587 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 588 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 589 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 590 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 591 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 592 | } 593 | return; 594 | } 595 | 596 | /* Wait for the first loss before sampling, to let the policer exhaust 597 | * its tokens and estimate the steady-state rate allowed by the policer. 598 | * Starting samples earlier includes bursts that over-estimate the bw. 599 | */ 600 | if (!bbr->lt_is_sampling) { 601 | if (!rs->losses) 602 | return; 603 | bbr_reset_lt_bw_sampling_interval(sk); 604 | bbr->lt_is_sampling = true; 605 | } 606 | 607 | /* To avoid underestimates, reset sampling if we run out of data. */ 608 | if (rs->is_app_limited) { 609 | bbr_reset_lt_bw_sampling(sk); 610 | return; 611 | } 612 | 613 | if (bbr->round_start) 614 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 615 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 616 | return; /* sampling interval needs to be longer */ 617 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 618 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 619 | return; 620 | } 621 | 622 | /* End sampling interval when a packet is lost, so we estimate the 623 | * policer tokens were exhausted. Stopping the sampling before the 624 | * tokens are exhausted under-estimates the policed rate. 625 | */ 626 | if (!rs->losses) 627 | return; 628 | 629 | /* Calculate packets lost and delivered in sampling interval. 
*/ 630 | lost = tp->lost - bbr->lt_last_lost; 631 | delivered = tp->delivered - bbr->lt_last_delivered; 632 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 633 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 634 | return; 635 | 636 | /* Find average delivery rate in this sampling interval. */ 637 | t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; 638 | if ((s32)t < 1) 639 | return; /* interval is less than one ms, so wait */ 640 | /* Check if can multiply without overflow */ 641 | if (t >= ~0U / USEC_PER_MSEC) { 642 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 643 | return; 644 | } 645 | t *= USEC_PER_MSEC; 646 | bw = (u64)delivered * BW_UNIT; 647 | do_div(bw, t); 648 | bbr_lt_bw_interval_done(sk, bw); 649 | } 650 | 651 | /* Estimate the bandwidth based on how fast packets are delivered */ 652 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 653 | { 654 | struct tcp_sock *tp = tcp_sk(sk); 655 | struct bbr *bbr = inet_csk_ca(sk); 656 | u64 bw; 657 | 658 | bbr->round_start = 0; 659 | if (rs->delivered < 0 || rs->interval_us <= 0) 660 | return; /* Not a valid observation */ 661 | 662 | /* See if we've reached the next RTT */ 663 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 664 | bbr->next_rtt_delivered = tp->delivered; 665 | bbr->rtt_cnt++; 666 | bbr->round_start = 1; 667 | bbr->packet_conservation = 0; 668 | } 669 | 670 | bbr_lt_bw_sampling(sk, rs); 671 | 672 | /* Divide delivered by the interval to find a (lower bound) bottleneck 673 | * bandwidth sample. Delivered is in packets and interval_us in uS and 674 | * ratio will be <<1 for most connections. So delivered is first scaled. 675 | */ 676 | bw = (u64)rs->delivered * BW_UNIT; 677 | do_div(bw, rs->interval_us); 678 | 679 | /* If this sample is application-limited, it is likely to have a very 680 | * low delivered count that represents application behavior rather than 681 | * the available network rate. Such a sample could drag down estimated 682 | * bw, causing needless slow-down. Thus, to continue to send at the 683 | * last measured network rate, we filter out app-limited samples unless 684 | * they describe the path bw at least as well as our bw model. 685 | * 686 | * So the goal during app-limited phase is to proceed with the best 687 | * network rate no matter how long. We automatically leave this 688 | * phase when app writes faster than the network can deliver :) 689 | */ 690 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 691 | /* Incorporate new sample into our max bw filter. */ 692 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 693 | } 694 | } 695 | 696 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 697 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 698 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 699 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 700 | * higher rwin, 3: we get higher delivery rate samples. Or transient 701 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 702 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
703 | */ 704 | static void bbr_check_full_bw_reached(struct sock *sk, 705 | const struct rate_sample *rs) 706 | { 707 | struct bbr *bbr = inet_csk_ca(sk); 708 | u32 bw_thresh; 709 | 710 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 711 | return; 712 | 713 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 714 | if (bbr_max_bw(sk) >= bw_thresh) { 715 | bbr->full_bw = bbr_max_bw(sk); 716 | bbr->full_bw_cnt = 0; 717 | return; 718 | } 719 | ++bbr->full_bw_cnt; 720 | } 721 | 722 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 723 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 724 | { 725 | struct bbr *bbr = inet_csk_ca(sk); 726 | 727 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 728 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 729 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 730 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 731 | } /* fall through to check if in-flight is already small: */ 732 | if (bbr->mode == BBR_DRAIN && 733 | tcp_packets_in_flight(tcp_sk(sk)) <= 734 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 735 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 736 | } 737 | 738 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 739 | * periodically drain the bottleneck queue, to converge to measure the true 740 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 741 | * small (reducing queuing delay and packet loss) and achieve fairness among 742 | * BBR flows. 743 | * 744 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 745 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 746 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 747 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 748 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 749 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 750 | * 751 | * Note that flows need only pay 2% if they are busy sending over the last 10 752 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 753 | * natural silences or low-rate periods within 10 seconds where the rate is low 754 | * enough for long enough to drain its queue in the bottleneck. We pick up 755 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 756 | */ 757 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 758 | { 759 | struct tcp_sock *tp = tcp_sk(sk); 760 | struct bbr *bbr = inet_csk_ca(sk); 761 | bool filter_expired; 762 | 763 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 764 | filter_expired = after(tcp_jiffies32, 765 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 766 | if (rs->rtt_us >= 0 && 767 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 768 | bbr->min_rtt_us = rs->rtt_us; 769 | bbr->min_rtt_stamp = tcp_jiffies32; 770 | } 771 | 772 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 773 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 774 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 775 | bbr->pacing_gain = BBR_UNIT; 776 | bbr->cwnd_gain = BBR_UNIT; 777 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 778 | bbr->probe_rtt_done_stamp = 0; 779 | } 780 | 781 | if (bbr->mode == BBR_PROBE_RTT) { 782 | /* Ignore low rate samples during this mode. 
*/ 783 | tp->app_limited = 784 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 785 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 786 | if (!bbr->probe_rtt_done_stamp && 787 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 788 | bbr->probe_rtt_done_stamp = tcp_jiffies32 + 789 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 790 | bbr->probe_rtt_round_done = 0; 791 | bbr->next_rtt_delivered = tp->delivered; 792 | } else if (bbr->probe_rtt_done_stamp) { 793 | if (bbr->round_start) 794 | bbr->probe_rtt_round_done = 1; 795 | if (bbr->probe_rtt_round_done && 796 | after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 797 | bbr->min_rtt_stamp = tcp_jiffies32; 798 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 799 | bbr_reset_mode(sk); 800 | } 801 | } 802 | } 803 | bbr->idle_restart = 0; 804 | } 805 | 806 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 807 | { 808 | bbr_update_bw(sk, rs); 809 | bbr_update_cycle_phase(sk, rs); 810 | bbr_check_full_bw_reached(sk, rs); 811 | bbr_check_drain(sk, rs); 812 | bbr_update_min_rtt(sk, rs); 813 | } 814 | 815 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 816 | { 817 | struct bbr *bbr = inet_csk_ca(sk); 818 | u32 bw; 819 | 820 | bbr_update_model(sk, rs); 821 | 822 | bw = bbr_bw(sk); 823 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 824 | bbr_set_tso_segs_goal(sk); 825 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 826 | } 827 | 828 | static void bbr_init(struct sock *sk) 829 | { 830 | struct tcp_sock *tp = tcp_sk(sk); 831 | struct bbr *bbr = inet_csk_ca(sk); 832 | 833 | bbr->prior_cwnd = 0; 834 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 835 | bbr->rtt_cnt = 0; 836 | bbr->next_rtt_delivered = 0; 837 | bbr->prev_ca_state = TCP_CA_Open; 838 | bbr->packet_conservation = 0; 839 | 840 | bbr->probe_rtt_done_stamp = 0; 841 | bbr->probe_rtt_round_done = 0; 842 | bbr->min_rtt_us = tcp_min_rtt(tp); 843 | bbr->min_rtt_stamp = tcp_jiffies32; 844 | 845 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 846 | 847 | bbr->has_seen_rtt = 0; 848 | bbr_init_pacing_rate_from_rtt(sk); 849 | 850 | bbr->restore_cwnd = 0; 851 | bbr->round_start = 0; 852 | bbr->idle_restart = 0; 853 | bbr->full_bw = 0; 854 | bbr->full_bw_cnt = 0; 855 | bbr->cycle_mstamp = 0; 856 | bbr->cycle_idx = 0; 857 | bbr_reset_lt_bw_sampling(sk); 858 | bbr_reset_startup_mode(sk); 859 | 860 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 861 | } 862 | 863 | static u32 bbr_sndbuf_expand(struct sock *sk) 864 | { 865 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 866 | return 3; 867 | } 868 | 869 | /* In theory BBR does not need to undo the cwnd since it does not 870 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 871 | */ 872 | static u32 bbr_undo_cwnd(struct sock *sk) 873 | { 874 | return tcp_sk(sk)->snd_cwnd; 875 | } 876 | 877 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ 878 | static u32 bbr_ssthresh(struct sock *sk) 879 | { 880 | bbr_save_cwnd(sk); 881 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 882 | } 883 | 884 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 885 | union tcp_cc_info *info) 886 | { 887 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 888 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 889 | struct tcp_sock *tp = tcp_sk(sk); 890 | struct bbr *bbr = inet_csk_ca(sk); 891 | u64 bw = bbr_bw(sk); 892 | 893 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 894 | memset(&info->bbr, 0, sizeof(info->bbr)); 895 | info->bbr.bbr_bw_lo = (u32)bw; 896 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 897 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 898 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 899 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 900 | *attr = INET_DIAG_BBRINFO; 901 | return sizeof(info->bbr); 902 | } 903 | return 0; 904 | } 905 | 906 | static void bbr_set_state(struct sock *sk, u8 new_state) 907 | { 908 | struct bbr *bbr = inet_csk_ca(sk); 909 | 910 | if (new_state == TCP_CA_Loss) { 911 | struct rate_sample rs = { .losses = 1 }; 912 | 913 | bbr->prev_ca_state = TCP_CA_Loss; 914 | bbr->full_bw = 0; 915 | bbr->round_start = 1; /* treat RTO like end of a round */ 916 | bbr_lt_bw_sampling(sk, &rs); 917 | } 918 | } 919 | 920 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 921 | .flags = TCP_CONG_NON_RESTRICTED, 922 | .name = "bbr", 923 | .owner = THIS_MODULE, 924 | .init = bbr_init, 925 | .cong_control = bbr_main, 926 | .sndbuf_expand = bbr_sndbuf_expand, 927 | .undo_cwnd = bbr_undo_cwnd, 928 | .cwnd_event = bbr_cwnd_event, 929 | .ssthresh = bbr_ssthresh, 930 | .tso_segs_goal = bbr_tso_segs_goal, 931 | .get_info = bbr_get_info, 932 | .set_state = bbr_set_state, 933 | }; 934 | 935 | static int __init bbr_register(void) 936 | { 937 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 938 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 939 | } 940 | 941 | static void __exit bbr_unregister(void) 942 | { 943 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 944 | } 945 | 946 | module_init(bbr_register); 947 | module_exit(bbr_unregister); 948 | 949 | MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); 950 | MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); 951 | MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); 952 | MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); 953 | MODULE_LICENSE("Dual BSD/GPL"); 954 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 955 | -------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.14/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable.
16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * otherwise TCP stack falls back to an internal pacing using one high 57 | * resolution timer per TCP socket and may use more resources. 58 | */ 59 | #include <linux/module.h> 60 | #include <net/tcp.h> 61 | #include <linux/inet_diag.h> 62 | #include <linux/inet.h> 63 | #include <linux/random.h> 64 | #include <linux/win_minmax.h> 65 | 66 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 67 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 68 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 69 | * Since the minimum window is >=4 packets, the lower bound isn't 70 | * an issue. The upper bound isn't an issue with existing technologies. 71 | */ 72 | #define BW_SCALE 24 73 | #define BW_UNIT (1 << BW_SCALE) 74 | 75 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g.
gains) */ 76 | #define BBR_UNIT (1 << BBR_SCALE) 77 | 78 | /* BBR has the following modes for deciding how fast to send: */ 79 | enum bbr_mode { 80 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 | BBR_DRAIN, /* drain any queue created during startup */ 82 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 84 | }; 85 | 86 | /* BBR congestion control block */ 87 | struct bbr { 88 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 89 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 90 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 91 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 92 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 93 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 94 | u64 cycle_mstamp; /* time of this cycle phase start */ 95 | u32 mode:3, /* current bbr_mode in state machine */ 96 | prev_ca_state:3, /* CA state on previous ACK */ 97 | packet_conservation:1, /* use packet conservation? */ 98 | restore_cwnd:1, /* decided to revert cwnd to old value */ 99 | round_start:1, /* start of packet-timed tx->ack round? */ 100 | tso_segs_goal:7, /* segments we want in each skb we send */ 101 | idle_restart:1, /* restarting after idle? */ 102 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 103 | unused:5, 104 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 105 | lt_rtt_cnt:7, /* round trips in long-term interval */ 106 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 107 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 108 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 109 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 110 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 111 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 112 | cwnd_gain:10, /* current gain for setting cwnd */ 113 | full_bw_cnt:3, /* number of rounds without large bw gains */ 114 | cycle_idx:3, /* current index in pacing_gain cycle array */ 115 | has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ 116 | unused_b:5; 117 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 118 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 119 | }; 120 | 121 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 122 | 123 | /* Window length of bw filter (in rounds): */ 124 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 125 | /* Window length of min_rtt filter (in sec): */ 126 | static const u32 bbr_min_rtt_win_sec = 10; 127 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 128 | static const u32 bbr_probe_rtt_mode_ms = 200; 129 | /* Skip TSO below the following bandwidth (bits/sec): */ 130 | static const int bbr_min_tso_rate = 1200000; 131 | 132 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 133 | * that will allow a smoothly increasing pacing rate that will double each RTT 134 | * and send the same number of packets per RTT that an un-paced, slow-starting 135 | * Reno or CUBIC flow would: 136 | */ 137 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 138 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 139 | * the queue created in BBR_STARTUP in a single round: 140 | */ 141 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 142 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 143 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 144 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 145 | static const int bbr_pacing_gain[] = { 146 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 147 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 148 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 150 | }; 151 | /* Randomize the starting gain cycling phase over N phases: */ 152 | static const u32 bbr_cycle_rand = 7; 153 | 154 | /* Try to keep at least this many packets in flight, if things go smoothly. For 155 | * smooth functioning, a sliding window protocol ACKing every other packet 156 | * needs at least 4 packets in flight: 157 | */ 158 | static const u32 bbr_cwnd_min_target = 4; 159 | 160 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 161 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 162 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 163 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 164 | static const u32 bbr_full_bw_cnt = 3; 165 | 166 | /* "long-term" ("LT") bandwidth estimator parameters... */ 167 | /* The minimum number of rounds in an LT bw sampling interval: */ 168 | static const u32 bbr_lt_intvl_min_rtts = 4; 169 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 170 | static const u32 bbr_lt_loss_thresh = 50; 171 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 172 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 173 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 174 | static const u32 bbr_lt_bw_diff = 4000 / 8; 175 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 176 | static const u32 bbr_lt_bw_max_rtts = 48; 177 | 178 | /* Do we estimate that STARTUP filled the pipe? 
*/ 179 | static bool bbr_full_bw_reached(const struct sock *sk) 180 | { 181 | const struct bbr *bbr = inet_csk_ca(sk); 182 | 183 | return bbr->full_bw_cnt >= bbr_full_bw_cnt; 184 | } 185 | 186 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 187 | static u32 bbr_max_bw(const struct sock *sk) 188 | { 189 | struct bbr *bbr = inet_csk_ca(sk); 190 | 191 | return minmax_get(&bbr->bw); 192 | } 193 | 194 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 195 | static u32 bbr_bw(const struct sock *sk) 196 | { 197 | struct bbr *bbr = inet_csk_ca(sk); 198 | 199 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 200 | } 201 | 202 | /* Return rate in bytes per second, optionally with a gain. 203 | * The order here is chosen carefully to avoid overflow of u64. This should 204 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 205 | */ 206 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207 | { 208 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 | rate *= gain; 210 | rate >>= BBR_SCALE; 211 | rate *= USEC_PER_SEC; 212 | return rate >> BW_SCALE; 213 | } 214 | 215 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 216 | static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 217 | { 218 | u64 rate = bw; 219 | 220 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 221 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 222 | return rate; 223 | } 224 | 225 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 226 | static void bbr_init_pacing_rate_from_rtt(struct sock *sk) 227 | { 228 | struct tcp_sock *tp = tcp_sk(sk); 229 | struct bbr *bbr = inet_csk_ca(sk); 230 | u64 bw; 231 | u32 rtt_us; 232 | 233 | if (tp->srtt_us) { /* any RTT sample yet? */ 234 | rtt_us = max(tp->srtt_us >> 3, 1U); 235 | bbr->has_seen_rtt = 1; 236 | } else { /* no RTT sample yet */ 237 | rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ 238 | } 239 | bw = (u64)tp->snd_cwnd * BW_UNIT; 240 | do_div(bw, rtt_us); 241 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); 242 | } 243 | 244 | /* Pace using current bw estimate and a gain factor. In order to help drive the 245 | * network toward lower queues while maintaining high utilization and low 246 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 247 | * estimated bandwidth. This is an important aspect of the design. In this 248 | * implementation this slightly lower pacing rate is achieved implicitly by not 249 | * including link-layer headers in the packet size used for the pacing rate. 250 | */ 251 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 252 | { 253 | struct tcp_sock *tp = tcp_sk(sk); 254 | struct bbr *bbr = inet_csk_ca(sk); 255 | u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 256 | 257 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 258 | bbr_init_pacing_rate_from_rtt(sk); 259 | if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) 260 | sk->sk_pacing_rate = rate; 261 | } 262 | 263 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 264 | static u32 bbr_tso_segs_goal(struct sock *sk) 265 | { 266 | struct bbr *bbr = inet_csk_ca(sk); 267 | 268 | return bbr->tso_segs_goal; 269 | } 270 | 271 | static void bbr_set_tso_segs_goal(struct sock *sk) 272 | { 273 | struct tcp_sock *tp = tcp_sk(sk); 274 | struct bbr *bbr = inet_csk_ca(sk); 275 | u32 min_segs; 276 | 277 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 278 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 279 | 0x7FU); 280 | } 281 | 282 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 283 | static void bbr_save_cwnd(struct sock *sk) 284 | { 285 | struct tcp_sock *tp = tcp_sk(sk); 286 | struct bbr *bbr = inet_csk_ca(sk); 287 | 288 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 289 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 290 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 291 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 292 | } 293 | 294 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 295 | { 296 | struct tcp_sock *tp = tcp_sk(sk); 297 | struct bbr *bbr = inet_csk_ca(sk); 298 | 299 | if (event == CA_EVENT_TX_START && tp->app_limited) { 300 | bbr->idle_restart = 1; 301 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 302 | * need more speed (we're restarting from idle and app-limited). 303 | */ 304 | if (bbr->mode == BBR_PROBE_BW) 305 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 306 | } 307 | } 308 | 309 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 310 | * estimated bottleneck bandwidth: 311 | * 312 | * cwnd = bw * min_rtt * gain = BDP * gain 313 | * 314 | * The key factor, gain, controls the amount of queue. While a small gain 315 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 316 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 317 | * noise may cause BBR to under-estimate the rate. 318 | * 319 | * To achieve full performance in high-speed paths, we budget enough cwnd to 320 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 321 | * - one skb in sending host Qdisc, 322 | * - one skb in sending host TSO/GSO engine 323 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 324 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 325 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 326 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 327 | * full even with ACK-every-other-packet delayed ACKs. 328 | */ 329 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 330 | { 331 | struct bbr *bbr = inet_csk_ca(sk); 332 | u32 cwnd; 333 | u64 w; 334 | 335 | /* If we've never had a valid RTT sample, cap cwnd at the initial 336 | * default. This should only happen when the connection is not using TCP 337 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 338 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 339 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 340 | */ 341 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 342 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 343 | 344 | w = (u64)bw * bbr->min_rtt_us; 345 | 346 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 347 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 348 | 349 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 350 | cwnd += 3 * bbr->tso_segs_goal; 351 | 352 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 353 | cwnd = (cwnd + 1) & ~1U; 354 | 355 | return cwnd; 356 | } 357 | 358 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 359 | * follow the packet conservation principle: send P packets per P packets acked. 360 | * After that, we slow-start and send at most 2*P packets per P packets acked. 361 | * After recovery finishes, or upon undo, we restore the cwnd we had when 362 | * recovery started (capped by the target cwnd based on estimated BDP). 363 | * 364 | * TODO(ycheng/ncardwell): implement a rate-based approach. 365 | */ 366 | static bool bbr_set_cwnd_to_recover_or_restore( 367 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 368 | { 369 | struct tcp_sock *tp = tcp_sk(sk); 370 | struct bbr *bbr = inet_csk_ca(sk); 371 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 372 | u32 cwnd = tp->snd_cwnd; 373 | 374 | /* An ACK for P pkts should release at most 2*P packets. We do this 375 | * in two steps. First, here we deduct the number of lost packets. 376 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 377 | */ 378 | if (rs->losses > 0) 379 | cwnd = max_t(s32, cwnd - rs->losses, 1); 380 | 381 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 382 | /* Starting 1st round of Recovery, so do packet conservation. */ 383 | bbr->packet_conservation = 1; 384 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 385 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 386 | cwnd = tcp_packets_in_flight(tp) + acked; 387 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 388 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 389 | bbr->restore_cwnd = 1; 390 | bbr->packet_conservation = 0; 391 | } 392 | bbr->prev_ca_state = state; 393 | 394 | if (bbr->restore_cwnd) { 395 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 396 | cwnd = max(cwnd, bbr->prior_cwnd); 397 | bbr->restore_cwnd = 0; 398 | } 399 | 400 | if (bbr->packet_conservation) { 401 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 402 | return true; /* yes, using packet conservation */ 403 | } 404 | *new_cwnd = cwnd; 405 | return false; 406 | } 407 | 408 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 409 | * has drawn us down below target), or snap down to target if we're above it. 410 | */ 411 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 412 | u32 acked, u32 bw, int gain) 413 | { 414 | struct tcp_sock *tp = tcp_sk(sk); 415 | struct bbr *bbr = inet_csk_ca(sk); 416 | u32 cwnd = 0, target_cwnd = 0; 417 | 418 | if (!acked) 419 | return; 420 | 421 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 422 | goto done; 423 | 424 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 425 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 426 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 427 | cwnd = min(cwnd + acked, target_cwnd); 428 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 429 | cwnd = cwnd + acked; 430 | cwnd = max(cwnd, bbr_cwnd_min_target); 431 | 432 | done: 433 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 434 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 435 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 436 | } 437 | 438 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 439 | static bool bbr_is_next_cycle_phase(struct sock *sk, 440 | const struct rate_sample *rs) 441 | { 442 | struct tcp_sock *tp = tcp_sk(sk); 443 | struct bbr *bbr = inet_csk_ca(sk); 444 | bool is_full_length = 445 | tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > 446 | bbr->min_rtt_us; 447 | u32 inflight, bw; 448 | 449 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 450 | * use the pipe without increasing the queue. 451 | */ 452 | if (bbr->pacing_gain == BBR_UNIT) 453 | return is_full_length; /* just use wall clock time */ 454 | 455 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 456 | bw = bbr_max_bw(sk); 457 | 458 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 459 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 460 | * small (e.g. on a LAN). We do not persist if packets are lost, since 461 | * a path with small buffers may not hold that much. 462 | */ 463 | if (bbr->pacing_gain > BBR_UNIT) 464 | return is_full_length && 465 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 466 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 467 | 468 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 469 | * probing didn't find more bw. If inflight falls to match BDP then we 470 | * estimate queue is drained; persisting would underutilize the pipe. 471 | */ 472 | return is_full_length || 473 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 474 | } 475 | 476 | static void bbr_advance_cycle_phase(struct sock *sk) 477 | { 478 | struct tcp_sock *tp = tcp_sk(sk); 479 | struct bbr *bbr = inet_csk_ca(sk); 480 | 481 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 482 | bbr->cycle_mstamp = tp->delivered_mstamp; 483 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 484 | } 485 | 486 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 487 | static void bbr_update_cycle_phase(struct sock *sk, 488 | const struct rate_sample *rs) 489 | { 490 | struct bbr *bbr = inet_csk_ca(sk); 491 | 492 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 493 | bbr_is_next_cycle_phase(sk, rs)) 494 | bbr_advance_cycle_phase(sk); 495 | } 496 | 497 | static void bbr_reset_startup_mode(struct sock *sk) 498 | { 499 | struct bbr *bbr = inet_csk_ca(sk); 500 | 501 | bbr->mode = BBR_STARTUP; 502 | bbr->pacing_gain = bbr_high_gain; 503 | bbr->cwnd_gain = bbr_high_gain; 504 | } 505 | 506 | static void bbr_reset_probe_bw_mode(struct sock *sk) 507 | { 508 | struct bbr *bbr = inet_csk_ca(sk); 509 | 510 | bbr->mode = BBR_PROBE_BW; 511 | bbr->pacing_gain = BBR_UNIT; 512 | bbr->cwnd_gain = bbr_cwnd_gain; 513 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 514 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 515 | } 516 | 517 | static void bbr_reset_mode(struct sock *sk) 518 | { 519 | if (!bbr_full_bw_reached(sk)) 520 | bbr_reset_startup_mode(sk); 521 | else 522 | bbr_reset_probe_bw_mode(sk); 523 | } 524 | 525 | /* Start a new long-term sampling interval. */ 526 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 527 | { 528 | struct tcp_sock *tp = tcp_sk(sk); 529 | struct bbr *bbr = inet_csk_ca(sk); 530 | 531 | bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); 532 | bbr->lt_last_delivered = tp->delivered; 533 | bbr->lt_last_lost = tp->lost; 534 | bbr->lt_rtt_cnt = 0; 535 | } 536 | 537 | /* Completely reset long-term bandwidth sampling. 
*/ 538 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 539 | { 540 | struct bbr *bbr = inet_csk_ca(sk); 541 | 542 | bbr->lt_bw = 0; 543 | bbr->lt_use_bw = 0; 544 | bbr->lt_is_sampling = false; 545 | bbr_reset_lt_bw_sampling_interval(sk); 546 | } 547 | 548 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 549 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 550 | { 551 | struct bbr *bbr = inet_csk_ca(sk); 552 | u32 diff; 553 | 554 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 555 | /* Is new bw close to the lt_bw from the previous interval? */ 556 | diff = abs(bw - bbr->lt_bw); 557 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 558 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 559 | bbr_lt_bw_diff)) { 560 | /* All criteria are met; estimate we're policed. */ 561 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 562 | bbr->lt_use_bw = 1; 563 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 564 | bbr->lt_rtt_cnt = 0; 565 | return; 566 | } 567 | } 568 | bbr->lt_bw = bw; 569 | bbr_reset_lt_bw_sampling_interval(sk); 570 | } 571 | 572 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 573 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 574 | * explicitly models their policed rate, to reduce unnecessary losses. We 575 | * estimate that we're policed if we see 2 consecutive sampling intervals with 576 | * consistent throughput and high packet loss. If we think we're being policed, 577 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 578 | */ 579 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 580 | { 581 | struct tcp_sock *tp = tcp_sk(sk); 582 | struct bbr *bbr = inet_csk_ca(sk); 583 | u32 lost, delivered; 584 | u64 bw; 585 | u32 t; 586 | 587 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 588 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 589 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 590 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 591 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 592 | } 593 | return; 594 | } 595 | 596 | /* Wait for the first loss before sampling, to let the policer exhaust 597 | * its tokens and estimate the steady-state rate allowed by the policer. 598 | * Starting samples earlier includes bursts that over-estimate the bw. 599 | */ 600 | if (!bbr->lt_is_sampling) { 601 | if (!rs->losses) 602 | return; 603 | bbr_reset_lt_bw_sampling_interval(sk); 604 | bbr->lt_is_sampling = true; 605 | } 606 | 607 | /* To avoid underestimates, reset sampling if we run out of data. */ 608 | if (rs->is_app_limited) { 609 | bbr_reset_lt_bw_sampling(sk); 610 | return; 611 | } 612 | 613 | if (bbr->round_start) 614 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 615 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 616 | return; /* sampling interval needs to be longer */ 617 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 618 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 619 | return; 620 | } 621 | 622 | /* End sampling interval when a packet is lost, so we estimate the 623 | * policer tokens were exhausted. Stopping the sampling before the 624 | * tokens are exhausted under-estimates the policed rate. 625 | */ 626 | if (!rs->losses) 627 | return; 628 | 629 | /* Calculate packets lost and delivered in sampling interval. 
*/ 630 | lost = tp->lost - bbr->lt_last_lost; 631 | delivered = tp->delivered - bbr->lt_last_delivered; 632 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 633 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 634 | return; 635 | 636 | /* Find average delivery rate in this sampling interval. */ 637 | t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; 638 | if ((s32)t < 1) 639 | return; /* interval is less than one ms, so wait */ 640 | /* Check if can multiply without overflow */ 641 | if (t >= ~0U / USEC_PER_MSEC) { 642 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 643 | return; 644 | } 645 | t *= USEC_PER_MSEC; 646 | bw = (u64)delivered * BW_UNIT; 647 | do_div(bw, t); 648 | bbr_lt_bw_interval_done(sk, bw); 649 | } 650 | 651 | /* Estimate the bandwidth based on how fast packets are delivered */ 652 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 653 | { 654 | struct tcp_sock *tp = tcp_sk(sk); 655 | struct bbr *bbr = inet_csk_ca(sk); 656 | u64 bw; 657 | 658 | bbr->round_start = 0; 659 | if (rs->delivered < 0 || rs->interval_us <= 0) 660 | return; /* Not a valid observation */ 661 | 662 | /* See if we've reached the next RTT */ 663 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 664 | bbr->next_rtt_delivered = tp->delivered; 665 | bbr->rtt_cnt++; 666 | bbr->round_start = 1; 667 | bbr->packet_conservation = 0; 668 | } 669 | 670 | bbr_lt_bw_sampling(sk, rs); 671 | 672 | /* Divide delivered by the interval to find a (lower bound) bottleneck 673 | * bandwidth sample. Delivered is in packets and interval_us in uS and 674 | * ratio will be <<1 for most connections. So delivered is first scaled. 675 | */ 676 | bw = (u64)rs->delivered * BW_UNIT; 677 | do_div(bw, rs->interval_us); 678 | 679 | /* If this sample is application-limited, it is likely to have a very 680 | * low delivered count that represents application behavior rather than 681 | * the available network rate. Such a sample could drag down estimated 682 | * bw, causing needless slow-down. Thus, to continue to send at the 683 | * last measured network rate, we filter out app-limited samples unless 684 | * they describe the path bw at least as well as our bw model. 685 | * 686 | * So the goal during app-limited phase is to proceed with the best 687 | * network rate no matter how long. We automatically leave this 688 | * phase when app writes faster than the network can deliver :) 689 | */ 690 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 691 | /* Incorporate new sample into our max bw filter. */ 692 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 693 | } 694 | } 695 | 696 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 697 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 698 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 699 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 700 | * higher rwin, 3: we get higher delivery rate samples. Or transient 701 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 702 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
703 | */ 704 | static void bbr_check_full_bw_reached(struct sock *sk, 705 | const struct rate_sample *rs) 706 | { 707 | struct bbr *bbr = inet_csk_ca(sk); 708 | u32 bw_thresh; 709 | 710 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 711 | return; 712 | 713 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 714 | if (bbr_max_bw(sk) >= bw_thresh) { 715 | bbr->full_bw = bbr_max_bw(sk); 716 | bbr->full_bw_cnt = 0; 717 | return; 718 | } 719 | ++bbr->full_bw_cnt; 720 | } 721 | 722 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 723 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 724 | { 725 | struct bbr *bbr = inet_csk_ca(sk); 726 | 727 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 728 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 729 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 730 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 731 | } /* fall through to check if in-flight is already small: */ 732 | if (bbr->mode == BBR_DRAIN && 733 | tcp_packets_in_flight(tcp_sk(sk)) <= 734 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 735 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 736 | } 737 | 738 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 739 | * periodically drain the bottleneck queue, to converge to measure the true 740 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 741 | * small (reducing queuing delay and packet loss) and achieve fairness among 742 | * BBR flows. 743 | * 744 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 745 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 746 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 747 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 748 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 749 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 750 | * 751 | * Note that flows need only pay 2% if they are busy sending over the last 10 752 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 753 | * natural silences or low-rate periods within 10 seconds where the rate is low 754 | * enough for long enough to drain its queue in the bottleneck. We pick up 755 | * these min RTT measurements opportunistically with our min_rtt filter. :-) 756 | */ 757 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 758 | { 759 | struct tcp_sock *tp = tcp_sk(sk); 760 | struct bbr *bbr = inet_csk_ca(sk); 761 | bool filter_expired; 762 | 763 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 764 | filter_expired = after(tcp_jiffies32, 765 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 766 | if (rs->rtt_us >= 0 && 767 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 768 | bbr->min_rtt_us = rs->rtt_us; 769 | bbr->min_rtt_stamp = tcp_jiffies32; 770 | } 771 | 772 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 773 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 774 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 775 | bbr->pacing_gain = BBR_UNIT; 776 | bbr->cwnd_gain = BBR_UNIT; 777 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 778 | bbr->probe_rtt_done_stamp = 0; 779 | } 780 | 781 | if (bbr->mode == BBR_PROBE_RTT) { 782 | /* Ignore low rate samples during this mode. 
*/ 783 | tp->app_limited = 784 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 785 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 786 | if (!bbr->probe_rtt_done_stamp && 787 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 788 | bbr->probe_rtt_done_stamp = tcp_jiffies32 + 789 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 790 | bbr->probe_rtt_round_done = 0; 791 | bbr->next_rtt_delivered = tp->delivered; 792 | } else if (bbr->probe_rtt_done_stamp) { 793 | if (bbr->round_start) 794 | bbr->probe_rtt_round_done = 1; 795 | if (bbr->probe_rtt_round_done && 796 | after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 797 | bbr->min_rtt_stamp = tcp_jiffies32; 798 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 799 | bbr_reset_mode(sk); 800 | } 801 | } 802 | } 803 | bbr->idle_restart = 0; 804 | } 805 | 806 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 807 | { 808 | bbr_update_bw(sk, rs); 809 | bbr_update_cycle_phase(sk, rs); 810 | bbr_check_full_bw_reached(sk, rs); 811 | bbr_check_drain(sk, rs); 812 | bbr_update_min_rtt(sk, rs); 813 | } 814 | 815 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 816 | { 817 | struct bbr *bbr = inet_csk_ca(sk); 818 | u32 bw; 819 | 820 | bbr_update_model(sk, rs); 821 | 822 | bw = bbr_bw(sk); 823 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 824 | bbr_set_tso_segs_goal(sk); 825 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 826 | } 827 | 828 | static void bbr_init(struct sock *sk) 829 | { 830 | struct tcp_sock *tp = tcp_sk(sk); 831 | struct bbr *bbr = inet_csk_ca(sk); 832 | 833 | bbr->prior_cwnd = 0; 834 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 835 | bbr->rtt_cnt = 0; 836 | bbr->next_rtt_delivered = 0; 837 | bbr->prev_ca_state = TCP_CA_Open; 838 | bbr->packet_conservation = 0; 839 | 840 | bbr->probe_rtt_done_stamp = 0; 841 | bbr->probe_rtt_round_done = 0; 842 | bbr->min_rtt_us = tcp_min_rtt(tp); 843 | bbr->min_rtt_stamp = tcp_jiffies32; 844 | 845 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 846 | 847 | bbr->has_seen_rtt = 0; 848 | bbr_init_pacing_rate_from_rtt(sk); 849 | 850 | bbr->restore_cwnd = 0; 851 | bbr->round_start = 0; 852 | bbr->idle_restart = 0; 853 | bbr->full_bw = 0; 854 | bbr->full_bw_cnt = 0; 855 | bbr->cycle_mstamp = 0; 856 | bbr->cycle_idx = 0; 857 | bbr_reset_lt_bw_sampling(sk); 858 | bbr_reset_startup_mode(sk); 859 | 860 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 861 | } 862 | 863 | static u32 bbr_sndbuf_expand(struct sock *sk) 864 | { 865 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 866 | return 3; 867 | } 868 | 869 | /* In theory BBR does not need to undo the cwnd since it does not 870 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 871 | */ 872 | static u32 bbr_undo_cwnd(struct sock *sk) 873 | { 874 | return tcp_sk(sk)->snd_cwnd; 875 | } 876 | 877 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ 878 | static u32 bbr_ssthresh(struct sock *sk) 879 | { 880 | bbr_save_cwnd(sk); 881 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 882 | } 883 | 884 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 885 | union tcp_cc_info *info) 886 | { 887 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 888 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 889 | struct tcp_sock *tp = tcp_sk(sk); 890 | struct bbr *bbr = inet_csk_ca(sk); 891 | u64 bw = bbr_bw(sk); 892 | 893 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 894 | memset(&info->bbr, 0, sizeof(info->bbr)); 895 | info->bbr.bbr_bw_lo = (u32)bw; 896 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 897 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 898 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 899 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 900 | *attr = INET_DIAG_BBRINFO; 901 | return sizeof(info->bbr); 902 | } 903 | return 0; 904 | } 905 | 906 | static void bbr_set_state(struct sock *sk, u8 new_state) 907 | { 908 | struct bbr *bbr = inet_csk_ca(sk); 909 | 910 | if (new_state == TCP_CA_Loss) { 911 | struct rate_sample rs = { .losses = 1 }; 912 | 913 | bbr->prev_ca_state = TCP_CA_Loss; 914 | bbr->full_bw = 0; 915 | bbr->round_start = 1; /* treat RTO like end of a round */ 916 | bbr_lt_bw_sampling(sk, &rs); 917 | } 918 | } 919 | 920 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 921 | .flags = TCP_CONG_NON_RESTRICTED, 922 | .name = "bbr", 923 | .owner = THIS_MODULE, 924 | .init = bbr_init, 925 | .cong_control = bbr_main, 926 | .sndbuf_expand = bbr_sndbuf_expand, 927 | .undo_cwnd = bbr_undo_cwnd, 928 | .cwnd_event = bbr_cwnd_event, 929 | .ssthresh = bbr_ssthresh, 930 | .tso_segs_goal = bbr_tso_segs_goal, 931 | .get_info = bbr_get_info, 932 | .set_state = bbr_set_state, 933 | }; 934 | 935 | static int __init bbr_register(void) 936 | { 937 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 938 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 939 | } 940 | 941 | static void __exit bbr_unregister(void) 942 | { 943 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 944 | } 945 | 946 | module_init(bbr_register); 947 | module_exit(bbr_unregister); 948 | 949 | MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); 950 | MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); 951 | MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); 952 | MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); 953 | MODULE_LICENSE("Dual BSD/GPL"); 954 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 955 | -------------------------------------------------------------------------------- /General/Debian/source/kernel-v4.15/tcp_bbr.c: -------------------------------------------------------------------------------- 1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control 2 | * 3 | * BBR congestion control computes the sending rate based on the delivery 4 | * rate (throughput) estimated from ACKs. In a nutshell: 5 | * 6 | * On each ACK, update our model of the network path: 7 | * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) 8 | * min_rtt = windowed_min(rtt, 10 seconds) 9 | * pacing_rate = pacing_gain * bottleneck_bandwidth 10 | * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) 11 | * 12 | * The core algorithm does not react directly to packet losses or delays, 13 | * although BBR may adjust the size of next send per ACK when loss is 14 | * observed, or adjust the sending rate if it estimates there is a 15 | * traffic policer, in order to keep the drop rate reasonable.
16 | * 17 | * Here is a state transition diagram for BBR: 18 | * 19 | * | 20 | * V 21 | * +---> STARTUP ----+ 22 | * | | | 23 | * | V | 24 | * | DRAIN ----+ 25 | * | | | 26 | * | V | 27 | * +---> PROBE_BW ----+ 28 | * | ^ | | 29 | * | | | | 30 | * | +----+ | 31 | * | | 32 | * +---- PROBE_RTT <--+ 33 | * 34 | * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. 35 | * When it estimates the pipe is full, it enters DRAIN to drain the queue. 36 | * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. 37 | * A long-lived BBR flow spends the vast majority of its time remaining 38 | * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth 39 | * in a fair manner, with a small, bounded queue. *If* a flow has been 40 | * continuously sending for the entire min_rtt window, and hasn't seen an RTT 41 | * sample that matches or decreases its min_rtt estimate for 10 seconds, then 42 | * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe 43 | * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if 44 | * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; 45 | * otherwise we enter STARTUP to try to fill the pipe. 46 | * 47 | * BBR is described in detail in: 48 | * "BBR: Congestion-Based Congestion Control", 49 | * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 50 | * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 51 | * 52 | * There is a public e-mail list for discussing BBR development and testing: 53 | * https://groups.google.com/forum/#!forum/bbr-dev 54 | * 55 | * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, 56 | * otherwise TCP stack falls back to an internal pacing using one high 57 | * resolution timer per TCP socket and may use more resources. 58 | */ 59 | #include <linux/module.h> 60 | #include <net/tcp.h> 61 | #include <linux/inet_diag.h> 62 | #include <linux/inet.h> 63 | #include <linux/random.h> 64 | #include <linux/win_minmax.h> 65 | 66 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth 67 | * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. 68 | * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 69 | * Since the minimum window is >=4 packets, the lower bound isn't 70 | * an issue. The upper bound isn't an issue with existing technologies. 71 | */ 72 | #define BW_SCALE 24 73 | #define BW_UNIT (1 << BW_SCALE) 74 | 75 | #define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g.
gains) */ 76 | #define BBR_UNIT (1 << BBR_SCALE) 77 | 78 | /* BBR has the following modes for deciding how fast to send: */ 79 | enum bbr_mode { 80 | BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 | BBR_DRAIN, /* drain any queue created during startup */ 82 | BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 | BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ 84 | }; 85 | 86 | /* BBR congestion control block */ 87 | struct bbr { 88 | u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ 89 | u32 min_rtt_stamp; /* timestamp of min_rtt_us */ 90 | u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ 91 | struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ 92 | u32 rtt_cnt; /* count of packet-timed rounds elapsed */ 93 | u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ 94 | u64 cycle_mstamp; /* time of this cycle phase start */ 95 | u32 mode:3, /* current bbr_mode in state machine */ 96 | prev_ca_state:3, /* CA state on previous ACK */ 97 | packet_conservation:1, /* use packet conservation? */ 98 | restore_cwnd:1, /* decided to revert cwnd to old value */ 99 | round_start:1, /* start of packet-timed tx->ack round? */ 100 | tso_segs_goal:7, /* segments we want in each skb we send */ 101 | idle_restart:1, /* restarting after idle? */ 102 | probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 103 | unused:5, 104 | lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 105 | lt_rtt_cnt:7, /* round trips in long-term interval */ 106 | lt_use_bw:1; /* use lt_bw as our bw estimate? */ 107 | u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ 108 | u32 lt_last_delivered; /* LT intvl start: tp->delivered */ 109 | u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ 110 | u32 lt_last_lost; /* LT intvl start: tp->lost */ 111 | u32 pacing_gain:10, /* current gain for setting pacing rate */ 112 | cwnd_gain:10, /* current gain for setting cwnd */ 113 | full_bw_reached:1, /* reached full bw in Startup? */ 114 | full_bw_cnt:2, /* number of rounds without large bw gains */ 115 | cycle_idx:3, /* current index in pacing_gain cycle array */ 116 | has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ 117 | unused_b:5; 118 | u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ 119 | u32 full_bw; /* recent bw, to estimate if pipe is full */ 120 | }; 121 | 122 | #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ 123 | 124 | /* Window length of bw filter (in rounds): */ 125 | static const int bbr_bw_rtts = CYCLE_LEN + 2; 126 | /* Window length of min_rtt filter (in sec): */ 127 | static const u32 bbr_min_rtt_win_sec = 10; 128 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ 129 | static const u32 bbr_probe_rtt_mode_ms = 200; 130 | /* Skip TSO below the following bandwidth (bits/sec): */ 131 | static const int bbr_min_tso_rate = 1200000; 132 | 133 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 134 | * that will allow a smoothly increasing pacing rate that will double each RTT 135 | * and send the same number of packets per RTT that an un-paced, slow-starting 136 | * Reno or CUBIC flow would: 137 | */ 138 | static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; 139 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain 140 | * the queue created in BBR_STARTUP in a single round: 141 | */ 142 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; 143 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ 144 | static const int bbr_cwnd_gain = BBR_UNIT * 2; 145 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ 146 | static const int bbr_pacing_gain[] = { 147 | BBR_UNIT * 5 / 4, /* probe for more available bw */ 148 | BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ 149 | BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ 150 | BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ 151 | }; 152 | /* Randomize the starting gain cycling phase over N phases: */ 153 | static const u32 bbr_cycle_rand = 7; 154 | 155 | /* Try to keep at least this many packets in flight, if things go smoothly. For 156 | * smooth functioning, a sliding window protocol ACKing every other packet 157 | * needs at least 4 packets in flight: 158 | */ 159 | static const u32 bbr_cwnd_min_target = 4; 160 | 161 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ 162 | /* If bw has increased significantly (1.25x), there may be more bw available: */ 163 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; 164 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ 165 | static const u32 bbr_full_bw_cnt = 3; 166 | 167 | /* "long-term" ("LT") bandwidth estimator parameters... */ 168 | /* The minimum number of rounds in an LT bw sampling interval: */ 169 | static const u32 bbr_lt_intvl_min_rtts = 4; 170 | /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ 171 | static const u32 bbr_lt_loss_thresh = 50; 172 | /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ 173 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; 174 | /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ 175 | static const u32 bbr_lt_bw_diff = 4000 / 8; 176 | /* If we estimate we're policed, use lt_bw for this many round trips: */ 177 | static const u32 bbr_lt_bw_max_rtts = 48; 178 | 179 | /* Do we estimate that STARTUP filled the pipe? 
*/ 180 | static bool bbr_full_bw_reached(const struct sock *sk) 181 | { 182 | const struct bbr *bbr = inet_csk_ca(sk); 183 | 184 | return bbr->full_bw_reached; 185 | } 186 | 187 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ 188 | static u32 bbr_max_bw(const struct sock *sk) 189 | { 190 | struct bbr *bbr = inet_csk_ca(sk); 191 | 192 | return minmax_get(&bbr->bw); 193 | } 194 | 195 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ 196 | static u32 bbr_bw(const struct sock *sk) 197 | { 198 | struct bbr *bbr = inet_csk_ca(sk); 199 | 200 | return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); 201 | } 202 | 203 | /* Return rate in bytes per second, optionally with a gain. 204 | * The order here is chosen carefully to avoid overflow of u64. This should 205 | * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 206 | */ 207 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 208 | { 209 | rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 210 | rate *= gain; 211 | rate >>= BBR_SCALE; 212 | rate *= USEC_PER_SEC; 213 | return rate >> BW_SCALE; 214 | } 215 | 216 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 217 | static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 218 | { 219 | u64 rate = bw; 220 | 221 | rate = bbr_rate_bytes_per_sec(sk, rate, gain); 222 | rate = min_t(u64, rate, sk->sk_max_pacing_rate); 223 | return rate; 224 | } 225 | 226 | /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ 227 | static void bbr_init_pacing_rate_from_rtt(struct sock *sk) 228 | { 229 | struct tcp_sock *tp = tcp_sk(sk); 230 | struct bbr *bbr = inet_csk_ca(sk); 231 | u64 bw; 232 | u32 rtt_us; 233 | 234 | if (tp->srtt_us) { /* any RTT sample yet? */ 235 | rtt_us = max(tp->srtt_us >> 3, 1U); 236 | bbr->has_seen_rtt = 1; 237 | } else { /* no RTT sample yet */ 238 | rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ 239 | } 240 | bw = (u64)tp->snd_cwnd * BW_UNIT; 241 | do_div(bw, rtt_us); 242 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); 243 | } 244 | 245 | /* Pace using current bw estimate and a gain factor. In order to help drive the 246 | * network toward lower queues while maintaining high utilization and low 247 | * latency, the average pacing rate aims to be slightly (~1%) lower than the 248 | * estimated bandwidth. This is an important aspect of the design. In this 249 | * implementation this slightly lower pacing rate is achieved implicitly by not 250 | * including link-layer headers in the packet size used for the pacing rate. 251 | */ 252 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) 253 | { 254 | struct tcp_sock *tp = tcp_sk(sk); 255 | struct bbr *bbr = inet_csk_ca(sk); 256 | u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 257 | 258 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 259 | bbr_init_pacing_rate_from_rtt(sk); 260 | if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) 261 | sk->sk_pacing_rate = rate; 262 | } 263 | 264 | /* Return count of segments we want in the skbs we send, or 0 for default. */ 265 | static u32 bbr_tso_segs_goal(struct sock *sk) 266 | { 267 | struct bbr *bbr = inet_csk_ca(sk); 268 | 269 | return bbr->tso_segs_goal; 270 | } 271 | 272 | static void bbr_set_tso_segs_goal(struct sock *sk) 273 | { 274 | struct tcp_sock *tp = tcp_sk(sk); 275 | struct bbr *bbr = inet_csk_ca(sk); 276 | u32 min_segs; 277 | 278 | min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; 279 | bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), 280 | 0x7FU); 281 | } 282 | 283 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ 284 | static void bbr_save_cwnd(struct sock *sk) 285 | { 286 | struct tcp_sock *tp = tcp_sk(sk); 287 | struct bbr *bbr = inet_csk_ca(sk); 288 | 289 | if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) 290 | bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ 291 | else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ 292 | bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); 293 | } 294 | 295 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) 296 | { 297 | struct tcp_sock *tp = tcp_sk(sk); 298 | struct bbr *bbr = inet_csk_ca(sk); 299 | 300 | if (event == CA_EVENT_TX_START && tp->app_limited) { 301 | bbr->idle_restart = 1; 302 | /* Avoid pointless buffer overflows: pace at est. bw if we don't 303 | * need more speed (we're restarting from idle and app-limited). 304 | */ 305 | if (bbr->mode == BBR_PROBE_BW) 306 | bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 307 | } 308 | } 309 | 310 | /* Find target cwnd. Right-size the cwnd based on min RTT and the 311 | * estimated bottleneck bandwidth: 312 | * 313 | * cwnd = bw * min_rtt * gain = BDP * gain 314 | * 315 | * The key factor, gain, controls the amount of queue. While a small gain 316 | * builds a smaller queue, it becomes more vulnerable to noise in RTT 317 | * measurements (e.g., delayed ACKs or other ACK compression effects). This 318 | * noise may cause BBR to under-estimate the rate. 319 | * 320 | * To achieve full performance in high-speed paths, we budget enough cwnd to 321 | * fit full-sized skbs in-flight on both end hosts to fully utilize the path: 322 | * - one skb in sending host Qdisc, 323 | * - one skb in sending host TSO/GSO engine 324 | * - one skb being received by receiver host LRO/GRO/delayed-ACK engine 325 | * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because 326 | * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, 327 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe 328 | * full even with ACK-every-other-packet delayed ACKs. 329 | */ 330 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) 331 | { 332 | struct bbr *bbr = inet_csk_ca(sk); 333 | u32 cwnd; 334 | u64 w; 335 | 336 | /* If we've never had a valid RTT sample, cap cwnd at the initial 337 | * default. This should only happen when the connection is not using TCP 338 | * timestamps and has retransmitted all of the SYN/SYNACK/data packets 339 | * ACKed so far. In this case, an RTO can cut cwnd to 1, in which 340 | * case we need to slow-start up toward something safe: TCP_INIT_CWND. 341 | */ 342 | if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ 343 | return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ 344 | 345 | w = (u64)bw * bbr->min_rtt_us; 346 | 347 | /* Apply a gain to the given value, then remove the BW_SCALE shift. */ 348 | cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; 349 | 350 | /* Allow enough full-sized skbs in flight to utilize end systems. */ 351 | cwnd += 3 * bbr->tso_segs_goal; 352 | 353 | /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ 354 | cwnd = (cwnd + 1) & ~1U; 355 | 356 | return cwnd; 357 | } 358 | 359 | /* An optimization in BBR to reduce losses: On the first round of recovery, we 360 | * follow the packet conservation principle: send P packets per P packets acked. 361 | * After that, we slow-start and send at most 2*P packets per P packets acked. 362 | * After recovery finishes, or upon undo, we restore the cwnd we had when 363 | * recovery started (capped by the target cwnd based on estimated BDP). 364 | * 365 | * TODO(ycheng/ncardwell): implement a rate-based approach. 366 | */ 367 | static bool bbr_set_cwnd_to_recover_or_restore( 368 | struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) 369 | { 370 | struct tcp_sock *tp = tcp_sk(sk); 371 | struct bbr *bbr = inet_csk_ca(sk); 372 | u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; 373 | u32 cwnd = tp->snd_cwnd; 374 | 375 | /* An ACK for P pkts should release at most 2*P packets. We do this 376 | * in two steps. First, here we deduct the number of lost packets. 377 | * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 378 | */ 379 | if (rs->losses > 0) 380 | cwnd = max_t(s32, cwnd - rs->losses, 1); 381 | 382 | if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { 383 | /* Starting 1st round of Recovery, so do packet conservation. */ 384 | bbr->packet_conservation = 1; 385 | bbr->next_rtt_delivered = tp->delivered; /* start round now */ 386 | /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ 387 | cwnd = tcp_packets_in_flight(tp) + acked; 388 | } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 389 | /* Exiting loss recovery; restore cwnd saved before recovery. */ 390 | bbr->restore_cwnd = 1; 391 | bbr->packet_conservation = 0; 392 | } 393 | bbr->prev_ca_state = state; 394 | 395 | if (bbr->restore_cwnd) { 396 | /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ 397 | cwnd = max(cwnd, bbr->prior_cwnd); 398 | bbr->restore_cwnd = 0; 399 | } 400 | 401 | if (bbr->packet_conservation) { 402 | *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 403 | return true; /* yes, using packet conservation */ 404 | } 405 | *new_cwnd = cwnd; 406 | return false; 407 | } 408 | 409 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss 410 | * has drawn us down below target), or snap down to target if we're above it. 411 | */ 412 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, 413 | u32 acked, u32 bw, int gain) 414 | { 415 | struct tcp_sock *tp = tcp_sk(sk); 416 | struct bbr *bbr = inet_csk_ca(sk); 417 | u32 cwnd = 0, target_cwnd = 0; 418 | 419 | if (!acked) 420 | return; 421 | 422 | if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 423 | goto done; 424 | 425 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ 426 | target_cwnd = bbr_target_cwnd(sk, bw, gain); 427 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ 428 | cwnd = min(cwnd + acked, target_cwnd); 429 | else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) 430 | cwnd = cwnd + acked; 431 | cwnd = max(cwnd, bbr_cwnd_min_target); 432 | 433 | done: 434 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ 435 | if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ 436 | tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); 437 | } 438 | 439 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ 440 | static bool bbr_is_next_cycle_phase(struct sock *sk, 441 | const struct rate_sample *rs) 442 | { 443 | struct tcp_sock *tp = tcp_sk(sk); 444 | struct bbr *bbr = inet_csk_ca(sk); 445 | bool is_full_length = 446 | tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > 447 | bbr->min_rtt_us; 448 | u32 inflight, bw; 449 | 450 | /* The pacing_gain of 1.0 paces at the estimated bw to try to fully 451 | * use the pipe without increasing the queue. 452 | */ 453 | if (bbr->pacing_gain == BBR_UNIT) 454 | return is_full_length; /* just use wall clock time */ 455 | 456 | inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 457 | bw = bbr_max_bw(sk); 458 | 459 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 460 | * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is 461 | * small (e.g. on a LAN). We do not persist if packets are lost, since 462 | * a path with small buffers may not hold that much. 463 | */ 464 | if (bbr->pacing_gain > BBR_UNIT) 465 | return is_full_length && 466 | (rs->losses || /* perhaps pacing_gain*BDP won't fit */ 467 | inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); 468 | 469 | /* A pacing_gain < 1.0 tries to drain extra queue we added if bw 470 | * probing didn't find more bw. If inflight falls to match BDP then we 471 | * estimate queue is drained; persisting would underutilize the pipe. 472 | */ 473 | return is_full_length || 474 | inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); 475 | } 476 | 477 | static void bbr_advance_cycle_phase(struct sock *sk) 478 | { 479 | struct tcp_sock *tp = tcp_sk(sk); 480 | struct bbr *bbr = inet_csk_ca(sk); 481 | 482 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 483 | bbr->cycle_mstamp = tp->delivered_mstamp; 484 | bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; 485 | } 486 | 487 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 488 | static void bbr_update_cycle_phase(struct sock *sk, 489 | const struct rate_sample *rs) 490 | { 491 | struct bbr *bbr = inet_csk_ca(sk); 492 | 493 | if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && 494 | bbr_is_next_cycle_phase(sk, rs)) 495 | bbr_advance_cycle_phase(sk); 496 | } 497 | 498 | static void bbr_reset_startup_mode(struct sock *sk) 499 | { 500 | struct bbr *bbr = inet_csk_ca(sk); 501 | 502 | bbr->mode = BBR_STARTUP; 503 | bbr->pacing_gain = bbr_high_gain; 504 | bbr->cwnd_gain = bbr_high_gain; 505 | } 506 | 507 | static void bbr_reset_probe_bw_mode(struct sock *sk) 508 | { 509 | struct bbr *bbr = inet_csk_ca(sk); 510 | 511 | bbr->mode = BBR_PROBE_BW; 512 | bbr->pacing_gain = BBR_UNIT; 513 | bbr->cwnd_gain = bbr_cwnd_gain; 514 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 515 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 516 | } 517 | 518 | static void bbr_reset_mode(struct sock *sk) 519 | { 520 | if (!bbr_full_bw_reached(sk)) 521 | bbr_reset_startup_mode(sk); 522 | else 523 | bbr_reset_probe_bw_mode(sk); 524 | } 525 | 526 | /* Start a new long-term sampling interval. */ 527 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) 528 | { 529 | struct tcp_sock *tp = tcp_sk(sk); 530 | struct bbr *bbr = inet_csk_ca(sk); 531 | 532 | bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); 533 | bbr->lt_last_delivered = tp->delivered; 534 | bbr->lt_last_lost = tp->lost; 535 | bbr->lt_rtt_cnt = 0; 536 | } 537 | 538 | /* Completely reset long-term bandwidth sampling. 
*/ 539 | static void bbr_reset_lt_bw_sampling(struct sock *sk) 540 | { 541 | struct bbr *bbr = inet_csk_ca(sk); 542 | 543 | bbr->lt_bw = 0; 544 | bbr->lt_use_bw = 0; 545 | bbr->lt_is_sampling = false; 546 | bbr_reset_lt_bw_sampling_interval(sk); 547 | } 548 | 549 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */ 550 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) 551 | { 552 | struct bbr *bbr = inet_csk_ca(sk); 553 | u32 diff; 554 | 555 | if (bbr->lt_bw) { /* do we have bw from a previous interval? */ 556 | /* Is new bw close to the lt_bw from the previous interval? */ 557 | diff = abs(bw - bbr->lt_bw); 558 | if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || 559 | (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= 560 | bbr_lt_bw_diff)) { 561 | /* All criteria are met; estimate we're policed. */ 562 | bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ 563 | bbr->lt_use_bw = 1; 564 | bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ 565 | bbr->lt_rtt_cnt = 0; 566 | return; 567 | } 568 | } 569 | bbr->lt_bw = bw; 570 | bbr_reset_lt_bw_sampling_interval(sk); 571 | } 572 | 573 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of 574 | * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and 575 | * explicitly models their policed rate, to reduce unnecessary losses. We 576 | * estimate that we're policed if we see 2 consecutive sampling intervals with 577 | * consistent throughput and high packet loss. If we think we're being policed, 578 | * set lt_bw to the "long-term" average delivery rate from those 2 intervals. 579 | */ 580 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) 581 | { 582 | struct tcp_sock *tp = tcp_sk(sk); 583 | struct bbr *bbr = inet_csk_ca(sk); 584 | u32 lost, delivered; 585 | u64 bw; 586 | u32 t; 587 | 588 | if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ 589 | if (bbr->mode == BBR_PROBE_BW && bbr->round_start && 590 | ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { 591 | bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ 592 | bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ 593 | } 594 | return; 595 | } 596 | 597 | /* Wait for the first loss before sampling, to let the policer exhaust 598 | * its tokens and estimate the steady-state rate allowed by the policer. 599 | * Starting samples earlier includes bursts that over-estimate the bw. 600 | */ 601 | if (!bbr->lt_is_sampling) { 602 | if (!rs->losses) 603 | return; 604 | bbr_reset_lt_bw_sampling_interval(sk); 605 | bbr->lt_is_sampling = true; 606 | } 607 | 608 | /* To avoid underestimates, reset sampling if we run out of data. */ 609 | if (rs->is_app_limited) { 610 | bbr_reset_lt_bw_sampling(sk); 611 | return; 612 | } 613 | 614 | if (bbr->round_start) 615 | bbr->lt_rtt_cnt++; /* count round trips in this interval */ 616 | if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) 617 | return; /* sampling interval needs to be longer */ 618 | if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { 619 | bbr_reset_lt_bw_sampling(sk); /* interval is too long */ 620 | return; 621 | } 622 | 623 | /* End sampling interval when a packet is lost, so we estimate the 624 | * policer tokens were exhausted. Stopping the sampling before the 625 | * tokens are exhausted under-estimates the policed rate. 626 | */ 627 | if (!rs->losses) 628 | return; 629 | 630 | /* Calculate packets lost and delivered in sampling interval. 
*/ 631 | lost = tp->lost - bbr->lt_last_lost; 632 | delivered = tp->delivered - bbr->lt_last_delivered; 633 | /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ 634 | if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) 635 | return; 636 | 637 | /* Find average delivery rate in this sampling interval. */ 638 | t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; 639 | if ((s32)t < 1) 640 | return; /* interval is less than one ms, so wait */ 641 | /* Check if can multiply without overflow */ 642 | if (t >= ~0U / USEC_PER_MSEC) { 643 | bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ 644 | return; 645 | } 646 | t *= USEC_PER_MSEC; 647 | bw = (u64)delivered * BW_UNIT; 648 | do_div(bw, t); 649 | bbr_lt_bw_interval_done(sk, bw); 650 | } 651 | 652 | /* Estimate the bandwidth based on how fast packets are delivered */ 653 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) 654 | { 655 | struct tcp_sock *tp = tcp_sk(sk); 656 | struct bbr *bbr = inet_csk_ca(sk); 657 | u64 bw; 658 | 659 | bbr->round_start = 0; 660 | if (rs->delivered < 0 || rs->interval_us <= 0) 661 | return; /* Not a valid observation */ 662 | 663 | /* See if we've reached the next RTT */ 664 | if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { 665 | bbr->next_rtt_delivered = tp->delivered; 666 | bbr->rtt_cnt++; 667 | bbr->round_start = 1; 668 | bbr->packet_conservation = 0; 669 | } 670 | 671 | bbr_lt_bw_sampling(sk, rs); 672 | 673 | /* Divide delivered by the interval to find a (lower bound) bottleneck 674 | * bandwidth sample. Delivered is in packets and interval_us in uS and 675 | * ratio will be <<1 for most connections. So delivered is first scaled. 676 | */ 677 | bw = (u64)rs->delivered * BW_UNIT; 678 | do_div(bw, rs->interval_us); 679 | 680 | /* If this sample is application-limited, it is likely to have a very 681 | * low delivered count that represents application behavior rather than 682 | * the available network rate. Such a sample could drag down estimated 683 | * bw, causing needless slow-down. Thus, to continue to send at the 684 | * last measured network rate, we filter out app-limited samples unless 685 | * they describe the path bw at least as well as our bw model. 686 | * 687 | * So the goal during app-limited phase is to proceed with the best 688 | * network rate no matter how long. We automatically leave this 689 | * phase when app writes faster than the network can deliver :) 690 | */ 691 | if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { 692 | /* Incorporate new sample into our max bw filter. */ 693 | minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); 694 | } 695 | } 696 | 697 | /* Estimate when the pipe is full, using the change in delivery rate: BBR 698 | * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by 699 | * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited 700 | * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the 701 | * higher rwin, 3: we get higher delivery rate samples. Or transient 702 | * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar 703 | * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
704 | */ 705 | static void bbr_check_full_bw_reached(struct sock *sk, 706 | const struct rate_sample *rs) 707 | { 708 | struct bbr *bbr = inet_csk_ca(sk); 709 | u32 bw_thresh; 710 | 711 | if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) 712 | return; 713 | 714 | bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; 715 | if (bbr_max_bw(sk) >= bw_thresh) { 716 | bbr->full_bw = bbr_max_bw(sk); 717 | bbr->full_bw_cnt = 0; 718 | return; 719 | } 720 | ++bbr->full_bw_cnt; 721 | bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; 722 | } 723 | 724 | /* If pipe is probably full, drain the queue and then enter steady-state. */ 725 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) 726 | { 727 | struct bbr *bbr = inet_csk_ca(sk); 728 | 729 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 730 | bbr->mode = BBR_DRAIN; /* drain queue we created */ 731 | bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ 732 | bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ 733 | } /* fall through to check if in-flight is already small: */ 734 | if (bbr->mode == BBR_DRAIN && 735 | tcp_packets_in_flight(tcp_sk(sk)) <= 736 | bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 737 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 738 | } 739 | 740 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 741 | * periodically drain the bottleneck queue, to converge to measure the true 742 | * min_rtt (unloaded propagation delay). This allows the flows to keep queues 743 | * small (reducing queuing delay and packet loss) and achieve fairness among 744 | * BBR flows. 745 | * 746 | * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, 747 | * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. 748 | * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed 749 | * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and 750 | * re-enter the previous mode. BBR uses 200ms to approximately bound the 751 | * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). 752 | * 753 | * Note that flows need only pay 2% if they are busy sending over the last 10 754 | * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have 755 | * natural silences or low-rate periods within 10 seconds where the rate is low 756 | * enough for long enough to drain its queue in the bottleneck. We pick up 757 | * these min RTT measurements opportunistically with our min_rtt filter. 
:-) 758 | */ 759 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) 760 | { 761 | struct tcp_sock *tp = tcp_sk(sk); 762 | struct bbr *bbr = inet_csk_ca(sk); 763 | bool filter_expired; 764 | 765 | /* Track min RTT seen in the min_rtt_win_sec filter window: */ 766 | filter_expired = after(tcp_jiffies32, 767 | bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); 768 | if (rs->rtt_us >= 0 && 769 | (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { 770 | bbr->min_rtt_us = rs->rtt_us; 771 | bbr->min_rtt_stamp = tcp_jiffies32; 772 | } 773 | 774 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 775 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 776 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 777 | bbr->pacing_gain = BBR_UNIT; 778 | bbr->cwnd_gain = BBR_UNIT; 779 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 780 | bbr->probe_rtt_done_stamp = 0; 781 | } 782 | 783 | if (bbr->mode == BBR_PROBE_RTT) { 784 | /* Ignore low rate samples during this mode. */ 785 | tp->app_limited = 786 | (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; 787 | /* Maintain min packets in flight for max(200 ms, 1 round). */ 788 | if (!bbr->probe_rtt_done_stamp && 789 | tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { 790 | bbr->probe_rtt_done_stamp = tcp_jiffies32 + 791 | msecs_to_jiffies(bbr_probe_rtt_mode_ms); 792 | bbr->probe_rtt_round_done = 0; 793 | bbr->next_rtt_delivered = tp->delivered; 794 | } else if (bbr->probe_rtt_done_stamp) { 795 | if (bbr->round_start) 796 | bbr->probe_rtt_round_done = 1; 797 | if (bbr->probe_rtt_round_done && 798 | after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 799 | bbr->min_rtt_stamp = tcp_jiffies32; 800 | bbr->restore_cwnd = 1; /* snap to prior_cwnd */ 801 | bbr_reset_mode(sk); 802 | } 803 | } 804 | } 805 | bbr->idle_restart = 0; 806 | } 807 | 808 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 809 | { 810 | bbr_update_bw(sk, rs); 811 | bbr_update_cycle_phase(sk, rs); 812 | bbr_check_full_bw_reached(sk, rs); 813 | bbr_check_drain(sk, rs); 814 | bbr_update_min_rtt(sk, rs); 815 | } 816 | 817 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) 818 | { 819 | struct bbr *bbr = inet_csk_ca(sk); 820 | u32 bw; 821 | 822 | bbr_update_model(sk, rs); 823 | 824 | bw = bbr_bw(sk); 825 | bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); 826 | bbr_set_tso_segs_goal(sk); 827 | bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); 828 | } 829 | 830 | static void bbr_init(struct sock *sk) 831 | { 832 | struct tcp_sock *tp = tcp_sk(sk); 833 | struct bbr *bbr = inet_csk_ca(sk); 834 | 835 | bbr->prior_cwnd = 0; 836 | bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ 837 | bbr->rtt_cnt = 0; 838 | bbr->next_rtt_delivered = 0; 839 | bbr->prev_ca_state = TCP_CA_Open; 840 | bbr->packet_conservation = 0; 841 | 842 | bbr->probe_rtt_done_stamp = 0; 843 | bbr->probe_rtt_round_done = 0; 844 | bbr->min_rtt_us = tcp_min_rtt(tp); 845 | bbr->min_rtt_stamp = tcp_jiffies32; 846 | 847 | minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ 848 | 849 | bbr->has_seen_rtt = 0; 850 | bbr_init_pacing_rate_from_rtt(sk); 851 | 852 | bbr->restore_cwnd = 0; 853 | bbr->round_start = 0; 854 | bbr->idle_restart = 0; 855 | bbr->full_bw_reached = 0; 856 | bbr->full_bw = 0; 857 | bbr->full_bw_cnt = 0; 858 | bbr->cycle_mstamp = 0; 859 | bbr->cycle_idx = 0; 860 | bbr_reset_lt_bw_sampling(sk); 861 | bbr_reset_startup_mode(sk); 862 | 863 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 
864 | } 865 | 866 | static u32 bbr_sndbuf_expand(struct sock *sk) 867 | { 868 | /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ 869 | return 3; 870 | } 871 | 872 | /* In theory BBR does not need to undo the cwnd since it does not 873 | * always reduce cwnd on losses (see bbr_main()). Keep it for now. 874 | */ 875 | static u32 bbr_undo_cwnd(struct sock *sk) 876 | { 877 | struct bbr *bbr = inet_csk_ca(sk); 878 | 879 | bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ 880 | bbr->full_bw_cnt = 0; 881 | bbr_reset_lt_bw_sampling(sk); 882 | return tcp_sk(sk)->snd_cwnd; 883 | } 884 | 885 | /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ 886 | static u32 bbr_ssthresh(struct sock *sk) 887 | { 888 | bbr_save_cwnd(sk); 889 | return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ 890 | } 891 | 892 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, 893 | union tcp_cc_info *info) 894 | { 895 | if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || 896 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 897 | struct tcp_sock *tp = tcp_sk(sk); 898 | struct bbr *bbr = inet_csk_ca(sk); 899 | u64 bw = bbr_bw(sk); 900 | 901 | bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; 902 | memset(&info->bbr, 0, sizeof(info->bbr)); 903 | info->bbr.bbr_bw_lo = (u32)bw; 904 | info->bbr.bbr_bw_hi = (u32)(bw >> 32); 905 | info->bbr.bbr_min_rtt = bbr->min_rtt_us; 906 | info->bbr.bbr_pacing_gain = bbr->pacing_gain; 907 | info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; 908 | *attr = INET_DIAG_BBRINFO; 909 | return sizeof(info->bbr); 910 | } 911 | return 0; 912 | } 913 | 914 | static void bbr_set_state(struct sock *sk, u8 new_state) 915 | { 916 | struct bbr *bbr = inet_csk_ca(sk); 917 | 918 | if (new_state == TCP_CA_Loss) { 919 | struct rate_sample rs = { .losses = 1 }; 920 | 921 | bbr->prev_ca_state = TCP_CA_Loss; 922 | bbr->full_bw = 0; 923 | bbr->round_start = 1; /* treat RTO like end of a round */ 924 | bbr_lt_bw_sampling(sk, &rs); 925 | } 926 | } 927 | 928 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { 929 | .flags = TCP_CONG_NON_RESTRICTED, 930 | .name = "bbr", 931 | .owner = THIS_MODULE, 932 | .init = bbr_init, 933 | .cong_control = bbr_main, 934 | .sndbuf_expand = bbr_sndbuf_expand, 935 | .undo_cwnd = bbr_undo_cwnd, 936 | .cwnd_event = bbr_cwnd_event, 937 | .ssthresh = bbr_ssthresh, 938 | .tso_segs_goal = bbr_tso_segs_goal, 939 | .get_info = bbr_get_info, 940 | .set_state = bbr_set_state, 941 | }; 942 | 943 | static int __init bbr_register(void) 944 | { 945 | BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); 946 | return tcp_register_congestion_control(&tcp_bbr_cong_ops); 947 | } 948 | 949 | static void __exit bbr_unregister(void) 950 | { 951 | tcp_unregister_congestion_control(&tcp_bbr_cong_ops); 952 | } 953 | 954 | module_init(bbr_register); 955 | module_exit(bbr_unregister); 956 | 957 | MODULE_AUTHOR("Van Jacobson "); 958 | MODULE_AUTHOR("Neal Cardwell "); 959 | MODULE_AUTHOR("Yuchung Cheng "); 960 | MODULE_AUTHOR("Soheil Hassas Yeganeh "); 961 | MODULE_LICENSE("Dual BSD/GPL"); 962 | MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); 963 | -------------------------------------------------------------------------------- /Makefile/Makefile-CentOS: -------------------------------------------------------------------------------- 1 | obj-m := tcp_nanqinlang.o 2 | 3 | all: 4 | make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc 5 | 6 | clean: 7 | make -C /lib/modules/`uname -r`/build M=`pwd` 
clean 8 | 9 | install: 10 | install tcp_nanqinlang.ko /lib/modules/`uname -r`/kernel/net/ipv4 11 | insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 12 | depmod -a 13 | 14 | uninstall: 15 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko -------------------------------------------------------------------------------- /Makefile/Makefile-Debian7or8: -------------------------------------------------------------------------------- 1 | obj-m := tcp_nanqinlang.o 2 | 3 | all: 4 | make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc-4.9 5 | 6 | clean: 7 | make -C /lib/modules/`uname -r`/build M=`pwd` clean 8 | 9 | install: 10 | install tcp_nanqinlang.ko /lib/modules/`uname -r`/kernel/net/ipv4 11 | insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 12 | depmod -a 13 | 14 | uninstall: 15 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko -------------------------------------------------------------------------------- /Makefile/Makefile-Debian9: -------------------------------------------------------------------------------- 1 | obj-m := tcp_nanqinlang.o 2 | 3 | all: 4 | make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc-6 5 | 6 | clean: 7 | make -C /lib/modules/`uname -r`/build M=`pwd` clean 8 | 9 | install: 10 | install tcp_nanqinlang.ko /lib/modules/`uname -r`/kernel/net/ipv4 11 | insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko 12 | depmod -a 13 | 14 | uninstall: 15 | rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_nanqinlang.ko -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # tcp_nanqinlang 2 | 3 | [![build](https://github.com/nanqinlang/SVG/blob/master/build%20passing.svg)](https://github.com/tcp-nanqinlang/general) 4 | [![language1](https://github.com/nanqinlang/SVG/blob/master/language-c-blue.svg)](https://github.com/tcp-nanqinlang/general) 5 | [![language2](https://github.com/nanqinlang/SVG/blob/master/language-shell-blue.svg)](https://github.com/tcp-nanqinlang/general) 6 | [![author](https://github.com/nanqinlang/SVG/blob/master/author-nanqinlang-lightgrey.svg)](https://github.com/tcp-nanqinlang/general) 7 | [![license](https://github.com/nanqinlang/SVG/blob/master/license-GPLv3-orange.svg)](https://github.com/tcp-nanqinlang/general) 8 | 9 | An aggressively tuned (`violence`) enhancement of tcp_bbr 10 | 11 | Note that the install script works on `KVM` (or better) platforms 12 | 13 | ## References 14 | Update History 15 | https://github.com/tcp-nanqinlang/general/releases 16 | 17 | Chinese documentation (中文文档) 18 | https://github.com/tcp-nanqinlang/wiki/wiki/general 19 | --------------------------------------------------------------------------------
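The Makefiles above build, install, and `insmod` the module, but the kernel must also be told to use it as the active congestion control. A minimal sketch follows, assuming the module registers its algorithm under the name `nanqinlang` (the registered name is not visible in the files shown here, so confirm it via `net.ipv4.tcp_available_congestion_control` first) and following the `fq` pacing recommendation from the tcp_bbr.c header comment.

```bash
# Minimal sketch, assuming the algorithm registers as "nanqinlang" (verify before use).
lsmod | grep tcp_nanqinlang                         # module should already be loaded by `make install`
sysctl net.ipv4.tcp_available_congestion_control    # confirm the exact algorithm name

# Persist the settings: use fq for pacing and select the algorithm.
cat >> /etc/sysctl.conf <<'EOF'
net.core.default_qdisc=fq
net.ipv4.tcp_congestion_control=nanqinlang
EOF
sysctl -p                                           # apply now
sysctl net.ipv4.tcp_congestion_control              # verify the active algorithm
```

If the reported name differs on your system, substitute it in the sysctl lines above.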