├── README.md
├── Makefile
├── bbrplus.sh
└── tcp_bbr_plus.c


/README.md:
--------------------------------------------------------------------------------
 1 | # bbrplus-debian
 2 | TCP-BBR Plus For Debian
 3 | 
 4 | # Usage
 5 | ```
 6 | $ bash <(curl -f -L -sS https://raw.githubusercontent.com/Xaster/bbrplus-debian/master/bbrplus.sh)
 7 | ```
 8 | 
 9 | # Note
10 | ```
11 | Only supports Debian 9 or above (amd64); run the script as root.
12 | ```
13 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | obj-m := tcp_bbr_plus.o
 2 | 
 3 | all:
 4 | 	make -C /lib/modules/`uname -r`/build M=`pwd` modules CC=/usr/bin/gcc-6
 5 | 
 6 | clean:
 7 | 	make -C /lib/modules/`uname -r`/build M=`pwd` clean
 8 | 
 9 | install:
10 | 	install tcp_bbr_plus.ko /lib/modules/`uname -r`/kernel/net/ipv4
11 | 	insmod /lib/modules/`uname -r`/kernel/net/ipv4/tcp_bbr_plus.ko
12 | 	depmod -a
13 | 
14 | uninstall:
15 | 	rm /lib/modules/`uname -r`/kernel/net/ipv4/tcp_bbr_plus.ko
16 | 


--------------------------------------------------------------------------------
/bbrplus.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #Change working directory to home directory
 4 | cd
 5 | 
 6 | #Update system
 7 | apt update && apt full-upgrade -y
 8 | 
 9 | #Install build dependencies
10 | apt install -y \
11 |   wget \
12 |   ca-certificates \
13 |   build-essential \
14 |   linux-headers-amd64 \
15 |   | tee build-deps.txt
16 | 
17 | #Update CA Certificates
18 | update-ca-certificates
19 | 
20 | #Build and install TCP-BBR Plus
21 | mkdir bbrplus-debian
22 | cd bbrplus-debian
23 | wget https://raw.githubusercontent.com/Xaster/bbrplus-debian/master/Makefile
24 | wget https://raw.githubusercontent.com/Xaster/bbrplus-debian/master/tcp_bbr_plus.c
25 | make
26 | make install
27 | cd
28 | 
29 | #Config TCP-BBR Plus
30 | [ ! -f /etc/sysctl.conf ] && touch /etc/sysctl.conf
31 | sed -i '/net.core.default_qdisc.*/d' /etc/sysctl.conf
32 | sed -i '/net.ipv4.tcp_congestion_control.*/d' /etc/sysctl.conf
33 | cat >> /etc/sysctl.conf << \EOF
34 | net.core.default_qdisc=fq
35 | net.ipv4.tcp_congestion_control=bbr_plus
36 | EOF
37 | sysctl -p
38 | 
39 | #Remove build dependencies
40 | apt purge --auto-remove -y $(cat build-deps.txt | grep "Unpacking " | cut -d " " -f 2)
41 | apt clean
42 | 
43 | #Remove temporary files
44 | rm -rf \
45 |   $HOME/bbrplus-debian \
46 |   $HOME/build-deps.txt \
47 |   /var/lib/apt/lists/*
48 | 
49 | #Check TCP-BBR Plus status
50 | sysctl net.ipv4.tcp_available_congestion_control | grep -q bbr_plus
51 | if [ $? -eq 0 ];then
52 |   lsmod | grep -q tcp_bbr_plus
53 |   if [ $? -eq 0 ];then
54 |     echo -e "\033[92m TCP-BBR Plus has been built and load. \033[0m"
55 |   else
56 |     echo -e "\033[91m TCP-BBR Plus load failed. \033[0m"
57 |   fi
58 | else
59 |   echo -e "\033[91m TCP-BBR Plus not found. \033[0m"
60 | fi
61 | 


--------------------------------------------------------------------------------
/tcp_bbr_plus.c:
--------------------------------------------------------------------------------
  1 | /* Bottleneck Bandwidth and RTT (BBR) congestion control
  2 |  *
  3 |  * BBR congestion control computes the sending rate based on the delivery
  4 |  * rate (throughput) estimated from ACKs. In a nutshell:
  5 |  *
  6 |  *   On each ACK, update our model of the network path:
  7 |  *      bottleneck_bandwidth = windowed_max(delivered / elapsed, 15 round trips)
  8 |  *      min_rtt = windowed_min(rtt, 7 seconds)
  9 |  *   pacing_rate = pacing_gain * bottleneck_bandwidth
 10 |  *   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
 11 |  *
 12 |  * The core algorithm does not react directly to packet losses or delays,
 13 |  * although BBR may adjust the size of next send per ACK when loss is
 14 |  * observed, or adjust the sending rate if it estimates there is a
 15 |  * traffic policer, in order to keep the drop rate reasonable.
 16 |  *
 17 |  * Here is a state transition diagram for BBR:
 18 |  *
 19 |  *             |
 20 |  *             V
 21 |  *    +---> STARTUP  ----+
 22 |  *    |        |         |
 23 |  *    |        V         |
 24 |  *    |      DRAIN   ----+
 25 |  *    |        |         |
 26 |  *    |        V         |
 27 |  *    +---> PROBE_BW ----+
 28 |  *    |      ^    |      |
 29 |  *    |      |    |      |
 30 |  *    |      +----+      |
 31 |  *    |                  |
 32 |  *    +---- PROBE_RTT <--+
 33 |  *
 34 |  * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
 35 |  * When it estimates the pipe is full, it enters DRAIN to drain the queue.
 36 |  * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
 37 |  * A long-lived BBR flow spends the vast majority of its time remaining
 38 |  * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
 39 |  * in a fair manner, with a small, bounded queue. *If* a flow has been
 40 |  * continuously sending for the entire min_rtt window, and hasn't seen an RTT
 41 |  * sample that matches or decreases its min_rtt estimate for that window, then
 42 |  * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
 43 |  * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
 44 |  * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
 45 |  * otherwise we enter STARTUP to try to fill the pipe.
 46 |  *
 47 |  * BBR is described in detail in:
 48 |  *   "BBR: Congestion-Based Congestion Control",
 49 |  *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
 50 |  *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
 51 |  *
 52 |  * There is a public e-mail list for discussing BBR development and testing:
 53 |  *   https://groups.google.com/forum/#!forum/bbr-dev
 54 |  *
 55 |  * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
 56 |  * since pacing is integral to the BBR design and implementation.
 57 |  * BBR without pacing would not function properly, and may incur unnecessary
 58 |  * high packet loss rates.
 59 |  */
 60 | 
 61 | #include <linux/module.h>
 62 | #include <linux/inet.h>
 63 | #include <linux/inet_diag.h>
 64 | #include <linux/random.h>
 65 | #include <linux/win_minmax.h>
 66 | #include <net/tcp.h>
 67 | 
 68 | /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
 69 |  * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
 70 |  * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
 71 |  * Since the minimum window is >=4 packets, the lower bound isn't
 72 |  * an issue. The upper bound isn't an issue with existing technologies.
 73 |  */
 74 | #define BW_SCALE  24	/* fixed-point shift applied to bandwidth values */
 75 | #define BW_UNIT   (1 << BW_SCALE)	/* 1.0 in that fixed-point format */
 76 | 
 77 | #define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
 78 | #define BBR_UNIT  (1 << BBR_SCALE)	/* a gain of exactly 1.0 */
 79 | 
 80 | #define CYCLE_LEN 8	/* number of phases in a pacing gain cycle; must stay a power of two because the cycle index wraps with & (CYCLE_LEN - 1) */
 81 | 
 82 | 
 83 | // **************************************************************************
 84 | // Core BBR definitions follow: modes, per-connection state and tunables
 85 | // **************************************************************************
 86 | 
 87 | 
 88 | /* BBR has the following modes for deciding how fast to send: */
 89 | // The current mode is kept in the 3-bit `mode` field of struct bbr.
 90 | enum bbr_mode {
 91 | 	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
 92 | 	BBR_DRAIN,	    /* drain any queue created during startup */
 93 | 	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
 94 | 	BBR_PROBE_RTT,	/* shrink cwnd (floor: bbr_cwnd_min_target) to probe min_rtt */
 95 | };
 96 | 
 97 | 
 98 | /* BBR congestion control block */
 99 | // Per-connection state, obtained via inet_csk_ca(sk); flags, gains and small counters are packed into two bit-field words.
100 | struct bbr {
101 | 	u32	min_rtt_us;	             	 /* min RTT in min_rtt_win_sec window; ~0U means no valid sample yet */
102 | 	u32	min_rtt_stamp;	             /* timestamp of min_rtt_us */
103 | 	u32	probe_rtt_done_stamp;        /* end time for BBR_PROBE_RTT mode */
104 | 	struct minmax bw;		         /* Max recent delivery rate in pkts/uS << 24 */
105 | 	u32	rtt_cnt;	    	         /* count of packet-timed rounds elapsed */
106 | 	u32 next_rtt_delivered;          /* tp->delivered value that marks the end of the current round */
107 | 	struct skb_mstamp cycle_mstamp;  /* time of this cycle phase start */
108 | 	u32	mode:3,		     	 /* current bbr_mode in state machine */
109 | 		prev_ca_state:3,        /* CA state on previous ACK */
110 | 		packet_conservation:1,  /* use packet conservation? */
111 | 		restore_cwnd:1,	        /* decided to revert cwnd to old value */
112 | 		round_start:1,	        /* start of packet-timed tx->ack round? */
113 | 		tso_segs_goal:7,        /* segments we want in each skb we send (capped at 0x7F) */
114 | 		idle_restart:1,	        /* restarting after idle? */
115 | 		probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
116 | 		unused:5,               /* padding; this word totals 32 bits */
117 | 		lt_is_sampling:1,       /* taking long-term ("LT") samples now? */
118 | 		lt_rtt_cnt:7,	        /* round trips in long-term interval */
119 | 		lt_use_bw:1;	        /* use lt_bw as our bw estimate? */
120 | 	u32	lt_bw;		         /* LT est delivery rate in pkts/uS << 24 */
121 | 	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
122 | 	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp.stamp_jiffies */
123 | 	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
124 | 	u32	pacing_gain:10,	     /* current gain for setting pacing rate */
125 | 		cwnd_gain:10,	        /* current gain for setting cwnd */
126 | 		full_bw_cnt:3,	        /* number of rounds without large bw gains */
127 | 		cycle_idx:3,	        /* current index in pacing_gain cycle array */
128 | 		unused_b:5;             /* padding; the fields of this word add up to 31 bits */
129 | 	u32	prior_cwnd;	  	     /* prior cwnd upon entering loss recovery */
130 | 	u32	full_bw;	  	     /* recent bw, to estimate if pipe is full */
131 | };
132 | 
133 | 
134 | /* Window length of bw filter (in rounds); CYCLE_LEN + 7 = 15: */
135 | static const int bbr_bw_rtts = CYCLE_LEN + 7;
136 | /* Window length of min_rtt filter (in sec): */
137 | static const u32 bbr_min_rtt_win_sec = 7;
138 | /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
139 | static const u32 bbr_probe_rtt_mode_ms = 70;
140 | /* Skip TSO below the following bandwidth (bits/sec): */
141 | static const int bbr_min_tso_rate = 1024000;
142 | 
143 | /* A high_gain of 2/ln(2) (~2.89) is the smallest pacing gain that allows a
144 |  * smoothly increasing pacing rate that doubles each RTT and sends the same
145 |  * number of packets per RTT that an un-paced, slow-starting Reno or CUBIC
146 |  * flow would. This variant is tuned more aggressively and uses ~3.25 instead:
147 |  */
148 | static const int bbr_high_gain  = BBR_UNIT * 3250 / 1000 + 1;
149 | /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
150 |  * the queue created in BBR_STARTUP in a single round:
151 |  */
152 | static const int bbr_drain_gain = BBR_UNIT * 1000 / 3250;
153 | /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
154 | static const int bbr_cwnd_gain  = BBR_UNIT * 2;
155 | /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
156 | static const int bbr_pacing_gain[] = {
157 | // Used for the steady-state BBR_PROBE_BW mode; one entry per cycle phase.
158 | // There are CYCLE_LEN (8) gains; none equals BBR_UNIT (1.0x) in this tuning.
159 | 	BBR_UNIT * 8 / 4,	/* 2.0x: probe for more available bw */
160 | 	BBR_UNIT * 3 / 4,	/* 0.75x: drain queue and/or yield bw to other flows */
161 | 	BBR_UNIT * 7 / 4, BBR_UNIT * 7 / 4, BBR_UNIT * 7 / 4,	/* 1.75x for three phases, */
162 | 	BBR_UNIT * 8 / 4, BBR_UNIT * 8 / 4, BBR_UNIT * 8 / 4	/* then 2.0x for the last three */
163 | };
164 | 
165 | /* Randomize the starting gain cycling phase over N phases: */
166 | static const u32 bbr_cycle_rand = 7;
167 | 
168 | /* Try to keep at least this many packets in flight, if things go smoothly. For
169 |  * smooth functioning, a sliding window protocol ACKing every other packet
170 |  * needs at least 4 packets in flight:
171 |  */
172 | // Also the floor applied to cwnd while in BBR_PROBE_RTT (see bbr_set_cwnd()).
173 | static const u32 bbr_cwnd_min_target = 4;
174 | 
175 | /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
176 | /* If bw has increased significantly (2x here), there may be more bw available: */
177 | static const u32 bbr_full_bw_thresh = BBR_UNIT * 8 / 4;
178 | /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
179 | static const u32 bbr_full_bw_cnt = 3;
180 | 
181 | /* "long-term" ("LT") bandwidth estimator parameters... */
182 | /* The minimum number of rounds in an LT bw sampling interval: */
183 | static const u32 bbr_lt_intvl_min_rtts = 4;
184 | /* If lost/delivered ratio >= 60/256 (~23%), interval is "lossy" and we may be policed: */
185 | static const u32 bbr_lt_loss_thresh = 60;
186 | /* If 2 intervals have a bw ratio <= 1/4, their bw is "consistent": */
187 | static const u32 bbr_lt_bw_ratio = BBR_UNIT / 4;
188 | /* If 2 intervals have a bw diff <= 1000 bytes/sec (8 Kbit/sec) their bw is "consistent": */
189 | static const u32 bbr_lt_bw_diff = 4000 / 4;
190 | /* If we estimate we're policed, use lt_bw for this many round trips: */
191 | static const u32 bbr_lt_bw_max_rtts = 40;
192 | 
193 | /* Do we estimate that STARTUP filled the pipe? */
194 | static bool bbr_full_bw_reached(const struct sock *sk)
195 | {
196 | 	const struct bbr *bbr = inet_csk_ca(sk);
197 | 
198 | 	return bbr->full_bw_cnt >= bbr_full_bw_cnt;
199 | }
200 | 
201 | /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
202 | static u32 bbr_max_bw(const struct sock *sk)
203 | {
204 | 	struct bbr *bbr = inet_csk_ca(sk);
205 | 
206 | 	return minmax_get(&bbr->bw);
207 | }
208 | 
209 | /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
210 | static u32 bbr_bw(const struct sock *sk)
211 | {
212 | 	struct bbr *bbr = inet_csk_ca(sk);
213 | 
214 | 	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
215 | }
216 | 
217 | /* Return rate in bytes per second, optionally with a gain.
218 |  * The order here is chosen carefully to avoid overflow of u64. This should
219 |  * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
220 |  */
221 | static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
222 | {
223 | 	rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
224 | 	rate *= gain;
225 | 	rate >>= BBR_SCALE;
226 | 	rate *= USEC_PER_SEC;
227 | 	return rate >> BW_SCALE;
228 | }
229 | 
230 | /* Pace using current bw estimate and a gain factor. In order to help drive the
231 |  * network toward lower queues while maintaining high utilization and low
232 |  * latency, the average pacing rate aims to be slightly (~1%) lower than the
233 |  * estimated bandwidth. This is an important aspect of the design. In this
234 |  * implementation this slightly lower pacing rate is achieved implicitly by not
235 |  * including link-layer headers in the packet size used for the pacing rate.
236 |  */
237 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
238 | {
239 | 	struct bbr *bbr = inet_csk_ca(sk);
240 | 	u64 rate = bw;
241 | 
242 | 	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
243 | 	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
244 | 	if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
245 | 		sk->sk_pacing_rate = rate;
246 | }
247 | 
248 | /* Return count of segments we want in the skbs we send, or 0 for default. */
249 | static u32 bbr_tso_segs_goal(struct sock *sk)
250 | {
251 | 	struct bbr *bbr = inet_csk_ca(sk);
252 | 
253 | 	return bbr->tso_segs_goal;
254 | }
255 | 
256 | static void bbr_set_tso_segs_goal(struct sock *sk)
257 | {
258 | 	struct tcp_sock *tp = tcp_sk(sk);
259 | 	struct bbr *bbr = inet_csk_ca(sk);
260 | 	u32 min_segs;
261 | 
262 | 	min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
263 | 	bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
264 | 				 0x7FU);
265 | }
266 | 
267 | /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
268 | static void bbr_save_cwnd(struct sock *sk)
269 | {
270 | 	struct tcp_sock *tp = tcp_sk(sk);
271 | 	struct bbr *bbr = inet_csk_ca(sk);
272 | 
273 | 	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
274 | 		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
275 | 	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
276 | 		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
277 | }
278 | 
279 | static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
280 | {
281 | 	struct tcp_sock *tp = tcp_sk(sk);
282 | 	struct bbr *bbr = inet_csk_ca(sk);
283 | 
284 | 	if (event == CA_EVENT_TX_START && tp->app_limited) {
285 | 		bbr->idle_restart = 1;
286 | 		/* Avoid pointless buffer overflows: pace at est. bw if we don't
287 | 		 * need more speed (we're restarting from idle and app-limited).
288 | 		 */
289 | 		if (bbr->mode == BBR_PROBE_BW)
290 | 			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
291 | 	}
292 | }
293 | 
294 | /* Find target cwnd. Right-size the cwnd based on min RTT and the
295 |  * estimated bottleneck bandwidth:
296 |  *
297 |  * cwnd = bw * min_rtt * gain = BDP * gain
298 |  *
299 |  * The key factor, gain, controls the amount of queue. While a small gain
300 |  * builds a smaller queue, it becomes more vulnerable to noise in RTT
301 |  * measurements (e.g., delayed ACKs or other ACK compression effects). This
302 |  * noise may cause BBR to under-estimate the rate.
303 |  *
304 |  * To achieve full performance in high-speed paths, we budget enough cwnd to
305 |  * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
306 |  *   - one skb in sending host Qdisc,
307 |  *   - one skb in sending host TSO/GSO engine
308 |  *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
309 |  * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
310 |  * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
311 |  * which allows 2 outstanding 2-packet sequences, to try to keep pipe
312 |  * full even with ACK-every-other-packet delayed ACKs.
313 |  */
314 | static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
315 | {
316 | 	struct bbr *bbr = inet_csk_ca(sk);
317 | 	u32 cwnd;
318 | 	u64 w;
319 | 
320 | 	/* If we've never had a valid RTT sample, cap cwnd at the initial
321 | 	 * default. This should only happen when the connection is not using TCP
322 | 	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
323 | 	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
324 | 	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
325 | 	 */
326 | 	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
327 | 		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd*/
328 | 
329 | 	w = (u64)bw * bbr->min_rtt_us;
330 | 
331 | 	/* Apply a gain to the given value, then remove the BW_SCALE shift. */
332 | 	cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
333 | 
334 | 	/* Allow enough full-sized skbs in flight to utilize end systems. */
335 | 	cwnd += 3 * bbr->tso_segs_goal;
336 | 
337 | 	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
338 | 	cwnd = (cwnd + 1) & ~1U;
339 | 
340 | 	return cwnd;
341 | }
342 | 
343 | /* An optimization in BBR to reduce losses: On the first round of recovery, we
344 |  * follow the packet conservation principle: send P packets per P packets acked.
345 |  * After that, we slow-start and send at most 2*P packets per P packets acked.
346 |  * After recovery finishes, or upon undo, we restore the cwnd we had when
347 |  * recovery started (capped by the target cwnd based on estimated BDP).
348 |  *
349 |  * TODO(ycheng/ncardwell): implement a rate-based approach.
350 |  */
351 | static bool bbr_set_cwnd_to_recover_or_restore(
352 | 	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
353 | {
354 | 	struct tcp_sock *tp = tcp_sk(sk);
355 | 	struct bbr *bbr = inet_csk_ca(sk);
356 | 	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
357 | 	u32 cwnd = tp->snd_cwnd;
358 | 
359 | 	/* An ACK for P pkts should release at most 2*P packets. We do this
360 | 	 * in two steps. First, here we deduct the number of lost packets.
361 | 	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
362 | 	 */
363 | 	if (rs->losses > 0)
364 | 		cwnd = max_t(s32, cwnd - rs->losses, 1);
365 | 
366 | 	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
367 | 		/* Starting 1st round of Recovery, so do packet conservation. */
368 | 		bbr->packet_conservation = 1;
369 | 		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
370 | 		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
371 | 		cwnd = tcp_packets_in_flight(tp) + acked;
372 | 	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
373 | 		/* Exiting loss recovery; restore cwnd saved before recovery. */
374 | 		bbr->restore_cwnd = 1;
375 | 		bbr->packet_conservation = 0;
376 | 	}
377 | 	bbr->prev_ca_state = state;
378 | 
379 | 	if (bbr->restore_cwnd) {
380 | 		/* Restore cwnd after exiting loss recovery or PROBE_RTT. */
381 | 		cwnd = max(cwnd, bbr->prior_cwnd);
382 | 		bbr->restore_cwnd = 0;
383 | 	}
384 | 
385 | 	if (bbr->packet_conservation) {
386 | 		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
387 | 		return true;	/* yes, using packet conservation */
388 | 	}
389 | 	*new_cwnd = cwnd;
390 | 	return false;
391 | }
392 | 
393 | /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
394 |  * has drawn us down below target), or snap down to target if we're above it.
395 |  */
396 | static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
397 | 			 u32 acked, u32 bw, int gain)
398 | {
399 | 	struct tcp_sock *tp = tcp_sk(sk);
400 | 	struct bbr *bbr = inet_csk_ca(sk);
401 | 	u32 cwnd = 0, target_cwnd = 0;
402 | 
403 | 	if (!acked)
404 | 		return;
405 | 
406 | 	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
407 | 		goto done;
408 | 
409 | 	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
410 | 	target_cwnd = bbr_target_cwnd(sk, bw, gain);
411 | 	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
412 | 		cwnd = min(cwnd + acked, target_cwnd);
413 | 	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
414 | 		cwnd = cwnd + acked;
415 | 	cwnd = max(cwnd, bbr_cwnd_min_target);
416 | 
417 | done:
418 | 	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
419 | 	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
420 | 		tp->snd_cwnd = max(tp->snd_cwnd >> 1, bbr_cwnd_min_target);
421 | }
422 | 
423 | /* End cycle phase if it's time and/or we hit the phase's in-flight target. */
424 | static bool bbr_is_next_cycle_phase(struct sock *sk,
425 | 				    const struct rate_sample *rs)
426 | {
427 | 	struct tcp_sock *tp = tcp_sk(sk);
428 | 	struct bbr *bbr = inet_csk_ca(sk);
429 | 	bool is_full_length =
430 | 		skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
431 | 		bbr->min_rtt_us;
432 | 	u32 inflight, bw;
433 | 
434 | 	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
435 | 	 * use the pipe without increasing the queue.
436 | 	 */
437 | 	if (bbr->pacing_gain == BBR_UNIT)
438 | 		return is_full_length;		/* just use wall clock time */
439 | 
440 | 	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
441 | 	bw = bbr_max_bw(sk);
442 | 
443 | 	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
444 | 	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
445 | 	 * small (e.g. on a LAN). We do not persist if packets are lost, since
446 | 	 * a path with small buffers may not hold that much.
447 | 	 */
448 | 	if (bbr->pacing_gain > BBR_UNIT)
449 | 		return is_full_length &&
450 | 			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
451 | 			 inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
452 | 
453 | 	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
454 | 	 * probing didn't find more bw. If inflight falls to match BDP then we
455 | 	 * estimate queue is drained; persisting would underutilize the pipe.
456 | 	 */
457 | 	return is_full_length ||
458 | 		inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
459 | }
460 | 
461 | static void bbr_advance_cycle_phase(struct sock *sk)
462 | {
463 | 	struct tcp_sock *tp = tcp_sk(sk);
464 | 	struct bbr *bbr = inet_csk_ca(sk);
465 | 
466 | 	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
467 | 	bbr->cycle_mstamp = tp->delivered_mstamp;
468 | 	bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
469 | }
470 | 
471 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
472 | static void bbr_update_cycle_phase(struct sock *sk,
473 | 				   const struct rate_sample *rs)
474 | {
475 | 	struct bbr *bbr = inet_csk_ca(sk);
476 | 
477 | 	if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
478 | 	    bbr_is_next_cycle_phase(sk, rs))
479 | 		bbr_advance_cycle_phase(sk);
480 | }
481 | 
482 | static void bbr_reset_startup_mode(struct sock *sk)
483 | {
484 | 	struct bbr *bbr = inet_csk_ca(sk);
485 | 
486 | 	bbr->mode = BBR_STARTUP;
487 | 	bbr->pacing_gain = bbr_high_gain;
488 | 	bbr->cwnd_gain	 = bbr_high_gain;
489 | }
490 | 
491 | static void bbr_reset_probe_bw_mode(struct sock *sk)
492 | {
493 | 	struct bbr *bbr = inet_csk_ca(sk);
494 | 
495 | 	bbr->mode = BBR_PROBE_BW;
496 | 	bbr->pacing_gain = BBR_UNIT;
497 | 	bbr->cwnd_gain = bbr_cwnd_gain;
498 | 	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
499 | 	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
500 | }
501 | 
/* Pick the mode to (re)enter: PROBE_BW when the pipe was already judged
 * full, STARTUP otherwise.
 */
static void bbr_reset_mode(struct sock *sk)
{
	if (bbr_full_bw_reached(sk))
		bbr_reset_probe_bw_mode(sk);
	else
		bbr_reset_startup_mode(sk);
}
509 | 
510 | /* Start a new long-term sampling interval. */
511 | static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
512 | {
513 | 	struct tcp_sock *tp = tcp_sk(sk);
514 | 	struct bbr *bbr = inet_csk_ca(sk);
515 | 
516 | 	bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
517 | 	bbr->lt_last_delivered = tp->delivered;
518 | 	bbr->lt_last_lost = tp->lost;
519 | 	bbr->lt_rtt_cnt = 0;
520 | }
521 | 
522 | /* Completely reset long-term bandwidth sampling. */
523 | static void bbr_reset_lt_bw_sampling(struct sock *sk)
524 | {
525 | 	struct bbr *bbr = inet_csk_ca(sk);
526 | 
527 | 	bbr->lt_bw = 0;
528 | 	bbr->lt_use_bw = 0;
529 | 	bbr->lt_is_sampling = false;
530 | 	bbr_reset_lt_bw_sampling_interval(sk);
531 | }
532 | 
533 | /* Long-term bw sampling interval is done. Estimate whether we're policed. */
534 | static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
535 | {
536 | 	struct bbr *bbr = inet_csk_ca(sk);
537 | 	u32 diff;
538 | 
539 | 	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
540 | 		/* Is new bw close to the lt_bw from the previous interval? */
541 | 		diff = abs(bw - bbr->lt_bw);
542 | 		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
543 | 		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
544 | 		     bbr_lt_bw_diff)) {
545 | 			/* All criteria are met; estimate we're policed. */
546 | 			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
547 | 			bbr->lt_use_bw = 1;
548 | 			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
549 | 			bbr->lt_rtt_cnt = 0;
550 | 			return;
551 | 		}
552 | 	}
553 | 	bbr->lt_bw = bw;
554 | 	bbr_reset_lt_bw_sampling_interval(sk);
555 | }
556 | 
557 | /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
558 |  * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
559 |  * explicitly models their policed rate, to reduce unnecessary losses. We
560 |  * estimate that we're policed if we see 2 consecutive sampling intervals with
561 |  * consistent throughput and high packet loss. If we think we're being policed,
562 |  * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
563 |  */
564 | static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
565 | {
566 | 	struct tcp_sock *tp = tcp_sk(sk);
567 | 	struct bbr *bbr = inet_csk_ca(sk);
568 | 	u32 lost, delivered;
569 | 	u64 bw;
570 | 	s32 t;
571 | 
572 | 	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
573 | 		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
574 | 		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
575 | 			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
576 | 			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
577 | 		}
578 | 		return;
579 | 	}
580 | 
581 | 	/* Wait for the first loss before sampling, to let the policer exhaust
582 | 	 * its tokens and estimate the steady-state rate allowed by the policer.
583 | 	 * Starting samples earlier includes bursts that over-estimate the bw.
584 | 	 */
585 | 	if (!bbr->lt_is_sampling) {
586 | 		if (!rs->losses)
587 | 			return;
588 | 		bbr_reset_lt_bw_sampling_interval(sk);
589 | 		bbr->lt_is_sampling = true;
590 | 	}
591 | 
592 | 	/* To avoid underestimates, reset sampling if we run out of data. */
593 | 	if (rs->is_app_limited) {
594 | 		bbr_reset_lt_bw_sampling(sk);
595 | 		return;
596 | 	}
597 | 
598 | 	if (bbr->round_start)
599 | 		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
600 | 	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
601 | 		return;		/* sampling interval needs to be longer */
602 | 	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
603 | 		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
604 | 		return;
605 | 	}
606 | 
607 | 	/* End sampling interval when a packet is lost, so we estimate the
608 | 	 * policer tokens were exhausted. Stopping the sampling before the
609 | 	 * tokens are exhausted under-estimates the policed rate.
610 | 	 */
611 | 	if (!rs->losses)
612 | 		return;
613 | 
614 | 	/* Calculate packets lost and delivered in sampling interval. */
615 | 	lost = tp->lost - bbr->lt_last_lost;
616 | 	delivered = tp->delivered - bbr->lt_last_delivered;
617 | 	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
618 | 	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
619 | 		return;
620 | 
621 | 	/* Find average delivery rate in this sampling interval. */
622 | 	t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
623 | 	if (t < 1)
624 | 		return;		/* interval is less than one jiffy, so wait */
625 | 	t = jiffies_to_usecs(t);
626 | 	/* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
627 | 	if (t < 1) {
628 | 		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
629 | 		return;
630 | 	}
631 | 	bw = (u64)delivered * BW_UNIT;
632 | 	do_div(bw, t);
633 | 	bbr_lt_bw_interval_done(sk, bw);
634 | }
635 | 
636 | /* Estimate the bandwidth based on how fast packets are delivered */
637 | static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
638 | {
639 | 	struct tcp_sock *tp = tcp_sk(sk);
640 | 	struct bbr *bbr = inet_csk_ca(sk);
641 | 	u64 bw;
642 | 
643 | 	bbr->round_start = 0;
644 | 	if (rs->delivered < 0 || rs->interval_us <= 0)
645 | 		return; /* Not a valid observation */
646 | 
647 | 	/* See if we've reached the next RTT */
648 | 	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
649 | 		bbr->next_rtt_delivered = tp->delivered;
650 | 		bbr->rtt_cnt++;
651 | 		bbr->round_start = 1;
652 | 		bbr->packet_conservation = 0;
653 | 	}
654 | 
655 | 	bbr_lt_bw_sampling(sk, rs);
656 | 
657 | 	/* Divide delivered by the interval to find a (lower bound) bottleneck
658 | 	 * bandwidth sample. Delivered is in packets and interval_us in uS and
659 | 	 * ratio will be <<1 for most connections. So delivered is first scaled.
660 | 	 */
661 | 	bw = (u64)rs->delivered * BW_UNIT;
662 | 	do_div(bw, rs->interval_us);
663 | 
664 | 	/* If this sample is application-limited, it is likely to have a very
665 | 	 * low delivered count that represents application behavior rather than
666 | 	 * the available network rate. Such a sample could drag down estimated
667 | 	 * bw, causing needless slow-down. Thus, to continue to send at the
668 | 	 * last measured network rate, we filter out app-limited samples unless
669 | 	 * they describe the path bw at least as well as our bw model.
670 | 	 *
671 | 	 * So the goal during app-limited phase is to proceed with the best
672 | 	 * network rate no matter how long. We automatically leave this
673 | 	 * phase when app writes faster than the network can deliver :)
674 | 	 */
675 | 	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
676 | 		/* Incorporate new sample into our max bw filter. */
677 | 		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
678 | 	}
679 | }
680 | 
681 | /* Estimate when the pipe is full, using the change in delivery rate: BBR
682 |  * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
683 |  * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
684 |  * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
685 |  * higher rwin, 3: we get higher delivery rate samples. Or transient
686 |  * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
687 |  * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
688 |  */
689 | static void bbr_check_full_bw_reached(struct sock *sk,
690 | 				      const struct rate_sample *rs)
691 | {
692 | 	struct bbr *bbr = inet_csk_ca(sk);
693 | 	u32 bw_thresh;
694 | 
695 | 	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
696 | 		return;
697 | 
698 | 	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
699 | 	if (bbr_max_bw(sk) >= bw_thresh) {
700 | 		bbr->full_bw = bbr_max_bw(sk);
701 | 		bbr->full_bw_cnt = 0;
702 | 		return;
703 | 	}
704 | 	++bbr->full_bw_cnt;
705 | }
706 | 
707 | /* If pipe is probably full, drain the queue and then enter steady-state. */
708 | static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
709 | {
710 | 	struct bbr *bbr = inet_csk_ca(sk);
711 | 
712 | 	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
713 | 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
714 | 		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
715 | 		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
716 | 	}	/* fall through to check if in-flight is already small: */
717 | 	if (bbr->mode == BBR_DRAIN &&
718 | 	    tcp_packets_in_flight(tcp_sk(sk)) <=
719 | 	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
720 | 		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
721 | }
722 | 
723 | /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
724 |  * periodically drain the bottleneck queue, to converge to measure the true
725 |  * min_rtt (unloaded propagation delay). This allows the flows to keep queues
726 |  * small (reducing queuing delay and packet loss) and achieve fairness among
727 |  * BBR flows.
728 |  *
729 |  * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
730 |  * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
731 |  * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
732 |  * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
733 |  * re-enter the previous mode. BBR uses 200ms to approximately bound the
734 |  * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
735 |  *
736 |  * Note that flows need only pay 2% if they are busy sending over the last 10
737 |  * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
738 |  * natural silences or low-rate periods within 10 seconds where the rate is low
739 |  * enough for long enough to drain its queue in the bottleneck. We pick up
740 |  * these min RTT measurements opportunistically with our min_rtt filter. :-)
741 |  */
742 | static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
743 | {
744 | 	struct tcp_sock *tp = tcp_sk(sk);
745 | 	struct bbr *bbr = inet_csk_ca(sk);
746 | 	bool filter_expired;
747 | 
748 | 	/* Track min RTT seen in the min_rtt_win_sec filter window: */
749 | 	// as above BBR_Structure define: "min_rtt_win_sec = 5 seconds"
750 | 	filter_expired = after(tcp_time_stamp,
751 | 			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
752 | 	if (rs->rtt_us >= 0 &&
753 | 	    (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
754 | 		bbr->min_rtt_us = rs->rtt_us;
755 | 		bbr->min_rtt_stamp = tcp_time_stamp;
756 | 	}
757 | 
758 | 	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
759 | 	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
760 | 		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
761 | 		bbr->pacing_gain = BBR_UNIT;
762 | 		bbr->cwnd_gain = BBR_UNIT;
763 | 		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
764 | 		bbr->probe_rtt_done_stamp = 0;
765 | 	}
766 | 
767 | 	if (bbr->mode == BBR_PROBE_RTT) {
768 | 		/* Ignore low rate samples during this mode. */
769 | 		tp->app_limited =
770 | 			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
771 | 		/* Maintain min packets in flight for max(200 ms, 1 round). */
772 | 		if (!bbr->probe_rtt_done_stamp &&
773 | 		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
774 | 			bbr->probe_rtt_done_stamp = tcp_time_stamp +
775 | 				msecs_to_jiffies(bbr_probe_rtt_mode_ms >> 1);
776 | 			bbr->probe_rtt_round_done = 0;
777 | 			bbr->next_rtt_delivered = tp->delivered;
778 | 		} else if (bbr->probe_rtt_done_stamp) {
779 | 			if (bbr->round_start)
780 | 				bbr->probe_rtt_round_done = 1;
781 | 			if (bbr->probe_rtt_round_done &&
782 | 			    after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
783 | 				bbr->min_rtt_stamp = tcp_time_stamp;
784 | 				bbr->restore_cwnd = 1;  /* snap to prior_cwnd */
785 | 				bbr_reset_mode(sk);
786 | 			}
787 | 		}
788 | 	}
789 | 	bbr->idle_restart = 0;
790 | }
791 | 
/* Refresh the whole path model from one rate sample. The helpers run in a
 * fixed order: bandwidth estimate, gain-cycle phase, full-pipe detection,
 * drain check, and finally min RTT / PROBE_RTT handling. Later steps read
 * state written by earlier ones (e.g. round_start set by bbr_update_bw()), so
 * the order must be kept.
 */
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
}
800 | 
801 | static void bbr_main(struct sock *sk, const struct rate_sample *rs)
802 | {
803 | 	struct bbr *bbr = inet_csk_ca(sk);
804 | 	u32 bw;
805 | 
806 | 	bbr_update_model(sk, rs);
807 | 
808 | 	bw = bbr_bw(sk);
809 | 	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
810 | 	bbr_set_tso_segs_goal(sk);
811 | 	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
812 | }
813 | 
814 | static void bbr_init(struct sock *sk)
815 | {
816 | 	struct tcp_sock *tp = tcp_sk(sk);
817 | 	struct bbr *bbr = inet_csk_ca(sk);
818 | 	u64 bw;
819 | 
820 | 	bbr->prior_cwnd = 0;
821 | 	bbr->tso_segs_goal = 0;	 /* default segs per skb until first ACK */
822 | 	bbr->rtt_cnt = 0;
823 | 	bbr->next_rtt_delivered = 0;
824 | 	bbr->prev_ca_state = TCP_CA_Open;
825 | 	bbr->packet_conservation = 0;
826 | 
827 | 	bbr->probe_rtt_done_stamp = 0;
828 | 	bbr->probe_rtt_round_done = 0;
829 | 	bbr->min_rtt_us = tcp_min_rtt(tp);
830 | 	bbr->min_rtt_stamp = tcp_time_stamp;
831 | 
832 | 	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
833 | 
834 | 	/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
835 | 	bw = (u64)tp->snd_cwnd * BW_UNIT;
836 | 	do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
837 | 	sk->sk_pacing_rate = 0;		/* force an update of sk_pacing_rate */
838 | 	bbr_set_pacing_rate(sk, bw, bbr_high_gain);
839 | 
840 | 	bbr->restore_cwnd = 0;
841 | 	bbr->round_start = 0;
842 | 	bbr->idle_restart = 0;
843 | 	bbr->full_bw = 0;
844 | 	bbr->full_bw_cnt = 0;
845 | 	bbr->cycle_mstamp.v64 = 0;
846 | 	bbr->cycle_idx = 0;
847 | 	bbr_reset_lt_bw_sampling(sk);
848 | 	bbr_reset_startup_mode(sk);
849 | }
850 | 
851 | static u32 bbr_sndbuf_expand(struct sock *sk)
852 | {
853 | 	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
854 | 	return 3;
855 | }
856 | 
857 | /* In theory BBR does not need to undo the cwnd since it does not
858 |  * always reduce cwnd on losses (see bbr_main()). Keep it for now.
859 |  */
860 | static u32 bbr_undo_cwnd(struct sock *sk)
861 | {
862 | 	return tcp_sk(sk)->snd_cwnd;
863 | }
864 | 
/* Entering loss recovery, so save cwnd for when we exit or undo recovery.
 * Installed as the .ssthresh hook; always returns TCP_INFINITE_SSTHRESH, so
 * no slow-start threshold is imposed on the flow.
 */
static u32 bbr_ssthresh(struct sock *sk)
{
	bbr_save_cwnd(sk);
	return TCP_INFINITE_SSTHRESH;	 /* BBR does not use ssthresh */
}
871 | 
872 | static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info)
873 | {
874 | 	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
875 | 	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
876 | 		struct tcp_sock *tp = tcp_sk(sk);
877 | 		struct bbr *bbr = inet_csk_ca(sk);
878 | 		u64 bw = bbr_bw(sk);
879 | 		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
880 | 		memset(&info->bbr, 0, sizeof(info->bbr));
881 | 		info->bbr.bbr_bw_lo		= (u32)bw;
882 | 		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
883 | 		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
884 | 		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
885 | 		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
886 | 		*attr = INET_DIAG_BBRINFO;
887 | 		return sizeof(info->bbr);
888 | 	}
889 | 	return 0;
890 | }
891 | 
892 | static void bbr_set_state(struct sock *sk, u8 new_state)
893 | {
894 | 	struct bbr *bbr = inet_csk_ca(sk);
895 | 
896 | 	if (new_state == TCP_CA_Loss) {
897 | 		struct rate_sample rs = { .losses = 1 };
898 | 
899 | 		bbr->prev_ca_state = TCP_CA_Loss;
900 | 		bbr->full_bw = 0;
901 | 		bbr->round_start = 1;	/* treat RTO like end of a round */
902 | 		bbr_lt_bw_sampling(sk, &rs);
903 | 	}
904 | }
905 | 
906 | static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
907 | 	.flags			= TCP_CONG_NON_RESTRICTED,
908 | 	.name			= "bbr_plus",
909 | 	.owner			= THIS_MODULE,
910 | 	.init			= bbr_init,
911 | 	.cong_control	= bbr_main,
912 | 	.sndbuf_expand	= bbr_sndbuf_expand,
913 | 	.undo_cwnd		= bbr_undo_cwnd,
914 | 	.cwnd_event		= bbr_cwnd_event,
915 | 	.ssthresh		= bbr_ssthresh,
916 | 	.tso_segs_goal	= bbr_tso_segs_goal,
917 | 	.get_info		= bbr_get_info,
918 | 	.set_state		= bbr_set_state,
919 | };
920 | 
921 | static int __init bbr_register(void)
922 | {
923 | 	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
924 | 	return tcp_register_congestion_control(&tcp_bbr_cong_ops);
925 | }
926 | 
/* Module exit: remove the bbr_plus ops table from the TCP stack. */
static void __exit bbr_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}
931 | 
/* Hook the register/unregister functions up as the module's init and exit
 * handlers.
 */
module_init(bbr_register);
module_exit(bbr_unregister);

/* Module metadata. */
MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
941 | 


--------------------------------------------------------------------------------