├── .github
    └── workflows
    │   └── semgrep.yml
├── COPYING
├── README.md
└── patches
    ├── 0001-audit-check-syscall-bitmap-on-entry-to-avoid-extra-w.patch
    ├── 0014-add-a-sysctl-to-enable-disable-tcp_collapse-logic.patch
    ├── 0020-Add-a-sysctl-to-allow-TCP-window-shrinking-in-order-.patch
    └── 0024-Add-xtsproxy-Crypto-API-module.patch


/.github/workflows/semgrep.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | on:
 3 |   pull_request: {}
 4 |   workflow_dispatch: {}
 5 |   push: 
 6 |     branches:
 7 |       - main
 8 |       - master
 9 |   schedule:
10 |     - cron: '0 0 * * *'
11 | name: Semgrep config
12 | jobs:
13 |   semgrep:
14 |     name: semgrep/ci
15 |     runs-on: ubuntu-20.04
16 |     env:
17 |       SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }}
18 |       SEMGREP_URL: https://cloudflare.semgrep.dev
19 |       SEMGREP_APP_URL: https://cloudflare.semgrep.dev
20 |       SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version
21 |     container:
22 |       image: returntocorp/semgrep
23 |     steps:
24 |       - uses: actions/checkout@v3
25 |       - run: semgrep ci
26 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | This software is distributed under the same license as The Linux Kernel.
2 | See https://www.kernel.org/pub/linux/kernel/COPYING for details.
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Cloudflare Linux Kernel Patches
 2 | 
 3 | This repository contains some Linux Kernel patches which were not submitted to the mainline kernel tree. These patches may not be generic enough for inclusion into the mainline kernel tree, may have been declined by the kernel maintainers, etc.
 4 | 
 5 | ## Supported Kernel Versions
 6 | 
 7 | Normally the patches in this repository should be applicable on top of the latest [Linux Kernel long-term support ("LTS") release](https://www.kernel.org/releases.html).  Other versions are not considered.
 8 | 
 9 | These patches are provided as-is without any support guarantee, but contributions are welcome.
10 | 


--------------------------------------------------------------------------------
/patches/0001-audit-check-syscall-bitmap-on-entry-to-avoid-extra-w.patch:
--------------------------------------------------------------------------------
  1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
  2 | From: Ivan Babrou <ivan@cloudflare.com>
  3 | Date: Mon, 22 May 2023 17:04:03 -0700
  4 | Subject: [PATCH] audit: check syscall bitmap on entry to avoid extra work
  5 | 
  6 | Currently audit subsystem arms itself as long as there are rules present,
  7 | which means that on every syscall exit all rules are evaluated, even
  8 | if they don't match the syscall to begin with. For setups where
  9 | there are no rules that can match any syscall, this means that
 10 | the CPU price needs to be paid when it's not necessary.
 11 | 
 12 | This patch introduces a bitmap for syscalls that is maintained
 13 | when rules are inserted and removed. For every syscall we maintain
 14 | a bit indicating whether it needs to be audited at all, which is then
 15 | checked at syscall entry. If there are no rules matching a syscall,
 16 | extra cost of checking all the rules is avoided.
 17 | 
 18 | Consider the following set of 10 audit rules as a benchmark:
 19 | 
 20 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/0 -F key=BENCH0
 21 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/1 -F key=BENCH1
 22 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/2 -F key=BENCH2
 23 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/3 -F key=BENCH3
 24 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/4 -F key=BENCH4
 25 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/5 -F key=BENCH5
 26 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/6 -F key=BENCH6
 27 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/7 -F key=BENCH7
 28 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/8 -F key=BENCH8
 29 |     -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/9 -F key=BENCH9
 30 | 
 31 | We can use the following benchmark to run unrelated syscalls:
 32 | 
 33 |     #include <sys/stat.h>
 34 |     #include <unistd.h>
 35 |     #include <stdio.h>
 36 | 
 37 |     #define GETPID_COUNT 100 * 1000
 38 |     #define STAT_COUNT 100 * 1000
 39 | 
 40 |     pid_t bench_getpid()
 41 |     {
 42 |         pid_t pid;
 43 | 
 44 |         for (int i = 0; i < GETPID_COUNT; i++)
 45 |         {
 46 |             pid = getpid();
 47 |         }
 48 | 
 49 |         return pid;
 50 |     }
 51 | 
 52 |     struct stat bench_stat()
 53 |     {
 54 |         struct stat statbuf;
 55 | 
 56 |         for (int i = 0; i < STAT_COUNT; i++)
 57 |         {
 58 |             stat("/etc/passwd", &statbuf);
 59 |         }
 60 | 
 61 |         return statbuf;
 62 |     }
 63 | 
 64 |     int main()
 65 |     {
 66 |         pid_t pid = bench_getpid();
 67 |         struct stat statbuf = bench_stat();
 68 | 
 69 |         printf("pid = %d, size = %d\n", pid, statbuf.st_size);
 70 |     }
 71 | 
 72 | Here we run 100k `getpid()` calls and 100k `stat()` calls, which are not
 73 | covered by any of the audit rules installed on the system.
 74 | 
 75 | When running without any rules present, but with auditd running, flamegraphs
 76 | show ~5% of CPU time spent in audit_* code. If we install the rules mentioned
 77 | above, this number jumps to ~24%. With this patch applied, the number is once
 78 | again down to 5%, which is what one would expect.
 79 | 
 80 | There's extra cost of maintaining the bitmap when rules are changed,
 81 | but it's negligible compared to CPU savings from cheaper syscalls.
 82 | 
 83 | Signed-off-by: Ivan Babrou <ivan@cloudflare.com>
 84 | 
 85 | ---
 86 | v2: Skipping AUDIT_NEVER in bitmap.
 87 | ---
 88 |  include/linux/audit.h | 21 +++++++++++++++++++++
 89 |  kernel/auditfilter.c  | 36 ++++++++++++++++++++++++++++++++----
 90 |  kernel/auditsc.c      | 27 +++++++++++----------------
 91 |  3 files changed, 64 insertions(+), 20 deletions(-)
 92 | 
 93 | diff --git a/include/linux/audit.h b/include/linux/audit.h
 94 | index 31086a72e32a..e99428052321 100644
 95 | --- a/include/linux/audit.h
 96 | +++ b/include/linux/audit.h
 97 | @@ -9,6 +9,7 @@
 98 |  #ifndef _LINUX_AUDIT_H_
 99 |  #define _LINUX_AUDIT_H_
100 |  
101 | +#include <linux/bitmap.h>
102 |  #include <linux/sched.h>
103 |  #include <linux/ptrace.h>
104 |  #include <linux/audit_arch.h>
105 | @@ -399,6 +400,22 @@ static inline void audit_ptrace(struct task_struct *t)
106 |  		__audit_ptrace(t);
107 |  }
108 |  
109 | +static inline int audit_in_mask(const struct audit_krule *rule, unsigned long val)
110 | +{
111 | +	int word, bit;
112 | +
113 | +	if (val > 0xffffffff)
114 | +		return false;
115 | +
116 | +	word = AUDIT_WORD(val);
117 | +	if (word >= AUDIT_BITMASK_SIZE)
118 | +		return false;
119 | +
120 | +	bit = AUDIT_BIT(val);
121 | +
122 | +	return rule->mask[word] & bit;
123 | +}
124 | +
125 |  				/* Private API (for audit.c only) */
126 |  extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
127 |  extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
128 | @@ -573,6 +590,10 @@ static inline void audit_log_nfcfg(const char *name, u8 af,
129 |  
130 |  extern int audit_n_rules;
131 |  extern int audit_signals;
132 | +
133 | +extern int audit_n_syscall_rules;
134 | +extern int audit_syscall_rules[NR_syscalls];
135 | +extern DECLARE_BITMAP(audit_syscalls_bitmap, NR_syscalls);
136 |  #else /* CONFIG_AUDITSYSCALL */
137 |  static inline int audit_alloc(struct task_struct *task)
138 |  {
139 | diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
140 | index 42d99896e7a6..e6ca85e96622 100644
141 | --- a/kernel/auditfilter.c
142 | +++ b/kernel/auditfilter.c
143 | @@ -943,7 +943,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
144 |  	struct list_head *list;
145 |  	int err = 0;
146 |  #ifdef CONFIG_AUDITSYSCALL
147 | -	int dont_count = 0;
148 | +	int syscall_nr, dont_count = 0;
149 |  
150 |  	/* If any of these, don't count towards total */
151 |  	switch(entry->rule.listnr) {
152 | @@ -1007,9 +1007,23 @@ static inline int audit_add_rule(struct audit_entry *entry)
153 |  		list_add_tail_rcu(&entry->list, list);
154 |  	}
155 |  #ifdef CONFIG_AUDITSYSCALL
156 | -	if (!dont_count)
157 | +	if (!dont_count) {
158 |  		audit_n_rules++;
159 |  
160 | +		if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
161 | +			audit_n_syscall_rules++;
162 | +
163 | +			if (entry->rule.action != AUDIT_NEVER) {
164 | +				for (syscall_nr = 0; syscall_nr < NR_syscalls; syscall_nr++) {
165 | +					if (!audit_in_mask(&entry->rule, syscall_nr))
166 | +						continue;
167 | +					if (++audit_syscall_rules[syscall_nr] == 1)
168 | +						set_bit(syscall_nr, audit_syscalls_bitmap);
169 | +				}
170 | +			}
171 | +		}
172 | +	}
173 | +
174 |  	if (!audit_match_signal(entry))
175 |  		audit_signals++;
176 |  #endif
177 | @@ -1026,7 +1040,7 @@ int audit_del_rule(struct audit_entry *entry)
178 |  	struct list_head *list;
179 |  	int ret = 0;
180 |  #ifdef CONFIG_AUDITSYSCALL
181 | -	int dont_count = 0;
182 | +	int syscall_nr, dont_count = 0;
183 |  
184 |  	/* If any of these, don't count towards total */
185 |  	switch(entry->rule.listnr) {
186 | @@ -1054,9 +1068,23 @@ int audit_del_rule(struct audit_entry *entry)
187 |  		audit_remove_mark_rule(&e->rule);
188 |  
189 |  #ifdef CONFIG_AUDITSYSCALL
190 | -	if (!dont_count)
191 | +	if (!dont_count) {
192 |  		audit_n_rules--;
193 |  
194 | +		if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
195 | +			audit_n_syscall_rules--;
196 | +
197 | +			if (entry->rule.action != AUDIT_NEVER) {
198 | +				for (syscall_nr = 0; syscall_nr < NR_syscalls; syscall_nr++) {
199 | +					if (!audit_in_mask(&entry->rule, syscall_nr))
200 | +						continue;
201 | +					if (--audit_syscall_rules[syscall_nr] == 0)
202 | +						clear_bit(syscall_nr, audit_syscalls_bitmap);
203 | +				}
204 | +			}
205 | +		}
206 | +	}
207 | +
208 |  	if (!audit_match_signal(entry))
209 |  		audit_signals--;
210 |  #endif
211 | diff --git a/kernel/auditsc.c b/kernel/auditsc.c
212 | index addeed3df15d..eb8296474bb2 100644
213 | --- a/kernel/auditsc.c
214 | +++ b/kernel/auditsc.c
215 | @@ -86,6 +86,15 @@ int audit_n_rules;
216 |  /* determines whether we collect data for signals sent */
217 |  int audit_signals;
218 |  
219 | +/* number of syscall related audit rules */
220 | +int audit_n_syscall_rules;
221 | +
222 | +/* number of rules per syscall */
223 | +int audit_syscall_rules[NR_syscalls];
224 | +
225 | +/* bitmap for checking whether a syscall is audited */
226 | +DECLARE_BITMAP(audit_syscalls_bitmap, NR_syscalls);
227 | +
228 |  struct audit_aux_data {
229 |  	struct audit_aux_data	*next;
230 |  	int			type;
231 | @@ -790,22 +799,6 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
232 |  	return AUDIT_STATE_BUILD;
233 |  }
234 |  
235 | -static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
236 | -{
237 | -	int word, bit;
238 | -
239 | -	if (val > 0xffffffff)
240 | -		return false;
241 | -
242 | -	word = AUDIT_WORD(val);
243 | -	if (word >= AUDIT_BITMASK_SIZE)
244 | -		return false;
245 | -
246 | -	bit = AUDIT_BIT(val);
247 | -
248 | -	return rule->mask[word] & bit;
249 | -}
250 | -
251 |  /**
252 |   * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
253 |   * @tsk: associated task
254 | @@ -2025,6 +2018,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
255 |  		return;
256 |  
257 |  	context->dummy = !audit_n_rules;
258 | +	if (!context->dummy && audit_n_syscall_rules == audit_n_rules)
259 | +		context->dummy = !test_bit(major, audit_syscalls_bitmap);
260 |  	if (!context->dummy && state == AUDIT_STATE_BUILD) {
261 |  		context->prio = 0;
262 |  		if (auditd_test_task(current))
263 | -- 
264 | 2.41.0
265 | 
266 | 


--------------------------------------------------------------------------------
/patches/0014-add-a-sysctl-to-enable-disable-tcp_collapse-logic.patch:
--------------------------------------------------------------------------------
  1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
  2 | From: "mfreemon@cloudflare.com" <mfreemon@cloudflare.com>
  3 | Date: Tue, 1 Mar 2022 17:06:02 -0600
  4 | Subject: [PATCH] Add a sysctl to skip tcp collapse processing when the receive
  5 |  buffer is full.
  6 | 
  7 | For context and additional information about this patch, see the
  8 | blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/
  9 | 
 10 | sysctl:  net.ipv4.tcp_collapse_max_bytes
 11 | 
 12 | If tcp_collapse_max_bytes is non-zero, attempt to collapse the
 13 | queue to free up memory if the current amount of memory allocated
 14 | is less than tcp_collapse_max_bytes.  Otherwise, the packet is
 15 | dropped without attempting to collapse the queue.
 16 | 
 17 | If tcp_collapse_max_bytes is zero, this feature is disabled
 18 | and the default Linux behavior is used.  The default Linux
 19 | behavior is to always perform the attempt to collapse the
 20 | queue to free up memory.
 21 | 
 22 | When the receive queue is small, we want to collapse the
 23 | queue.  There are two reasons for this: (a) the latency of
 24 | performing the collapse will be small on a small queue, and
 25 | (b) we want to avoid sending a congestion signal (via a
 26 | packet drop) to the sender when the receive queue is small.
 27 | 
 28 | The result is that we avoid latency spikes caused by the
 29 | time it takes to perform the collapse logic when the receive
 30 | queue is large and full, while preserving existing behavior
 31 | and performance for all other cases.
 32 | ---
 33 |  include/net/netns/ipv4.h   |  1 +
 34 |  include/trace/events/tcp.h |  7 +++++++
 35 |  net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 36 |  net/ipv4/tcp_input.c       | 36 ++++++++++++++++++++++++++++++++++++
 37 |  net/ipv4/tcp_ipv4.c        |  2 ++
 38 |  5 files changed, 53 insertions(+)
 39 | 
 40 | diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
 41 | index 6c5b2efc4f17..bf2c9b5847e4 100644
 42 | --- a/include/net/netns/ipv4.h
 43 | +++ b/include/net/netns/ipv4.h
 44 | @@ -182,6 +182,7 @@ struct netns_ipv4 {
 45 |  	int sysctl_udp_rmem_min;
 46 |  
 47 |  	u8 sysctl_fib_notify_on_flag_change;
 48 | +	unsigned int sysctl_tcp_collapse_max_bytes;
 49 |  
 50 |  #ifdef CONFIG_NET_L3_MASTER_DEV
 51 |  	u8 sysctl_udp_l3mdev_accept;
 52 | diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
 53 | index 521059d8dc0a..35fa0f23680c 100644
 54 | --- a/include/trace/events/tcp.h
 55 | +++ b/include/trace/events/tcp.h
 56 | @@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 57 |  	TP_ARGS(sk)
 58 |  );
 59 |  
 60 | +DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded,
 61 | +
 62 | +	TP_PROTO(struct sock *sk),
 63 | +
 64 | +	TP_ARGS(sk)
 65 | +);
 66 | +
 67 |  TRACE_EVENT(tcp_retransmit_synack,
 68 |  
 69 |  	TP_PROTO(const struct sock *sk, const struct request_sock *req),
 70 | diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
 71 | index 6f1e64d49232..a61e6b1ac0cc 100644
 72 | --- a/net/ipv4/sysctl_net_ipv4.c
 73 | +++ b/net/ipv4/sysctl_net_ipv4.c
 74 | @@ -1406,6 +1406,13 @@ static struct ctl_table ipv4_net_table[] = {
 75 |  		.extra1		= SYSCTL_ZERO,
 76 |  		.extra2		= &two,
 77 |  	},
 78 | +	{
 79 | +		.procname	= "tcp_collapse_max_bytes",
 80 | +		.data		= &init_net.ipv4.sysctl_tcp_collapse_max_bytes,
 81 | +		.maxlen		= sizeof(unsigned int),
 82 | +		.mode		= 0644,
 83 | +		.proc_handler	= proc_douintvec_minmax,
 84 | +	},
 85 |  	{ }
 86 |  };
 87 |  
 88 | diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
 89 | index f3b623967436..204f33f2835c 100644
 90 | --- a/net/ipv4/tcp_input.c
 91 | +++ b/net/ipv4/tcp_input.c
 92 | @@ -5340,6 +5340,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 93 |  static int tcp_prune_queue(struct sock *sk)
 94 |  {
 95 |  	struct tcp_sock *tp = tcp_sk(sk);
 96 | +	struct net *net = sock_net(sk);
 97 |  
 98 |  	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
 99 |  
100 | @@ -5351,6 +5352,39 @@ static int tcp_prune_queue(struct sock *sk)
101 |  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
102 |  		return 0;
103 |  
104 | +	/* For context and additional information about this patch, see the
105 | +	 * blog post at
106 | +	 *
107 | +	 * sysctl:  net.ipv4.tcp_collapse_max_bytes
108 | +	 *
109 | +	 * If tcp_collapse_max_bytes is non-zero, attempt to collapse the
110 | +	 * queue to free up memory if the current amount of memory allocated
111 | +	 * is less than tcp_collapse_max_bytes.  Otherwise, the packet is
112 | +	 * dropped without attempting to collapse the queue.
113 | +	 *
114 | +	 * If tcp_collapse_max_bytes is zero, this feature is disabled
115 | +	 * and the default Linux behavior is used.  The default Linux
116 | +	 * behavior is to always perform the attempt to collapse the
117 | +	 * queue to free up memory.
118 | +	 *
119 | +	 * When the receive queue is small, we want to collapse the
120 | +	 * queue.  There are two reasons for this: (a) the latency of
121 | +	 * performing the collapse will be small on a small queue, and
122 | +	 * (b) we want to avoid sending a congestion signal (via a
123 | +	 * packet drop) to the sender when the receive queue is small.
124 | +	 *
125 | +	 * The result is that we avoid latency spikes caused by the
126 | +	 * time it takes to perform the collapse logic when the receive
127 | +	 * queue is large and full, while preserving existing behavior
128 | +	 * and performance for all other cases.
129 | +	 */
130 | +	if (net->ipv4.sysctl_tcp_collapse_max_bytes &&
131 | +		(atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) {
132 | +		/* We are dropping the packet */
133 | +		trace_tcp_collapse_max_bytes_exceeded(sk);
134 | +		goto do_not_collapse;
135 | +	}
136 | +
137 |  	tcp_collapse_ofo_queue(sk);
138 |  	if (!skb_queue_empty(&sk->sk_receive_queue))
139 |  		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
140 | @@ -5370,6 +5404,8 @@ static int tcp_prune_queue(struct sock *sk)
141 |  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
142 |  		return 0;
143 |  
144 | +do_not_collapse:
145 | +
146 |  	/* If we are really being abused, tell the caller to silently
147 |  	 * drop receive data on the floor.  It will get retransmitted
148 |  	 * and hopefully then we'll have sufficient space.
149 | diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
150 | index 0fe9461647da..4fadbf38525f 100644
151 | --- a/net/ipv4/tcp_ipv4.c
152 | +++ b/net/ipv4/tcp_ipv4.c
153 | @@ -3218,6 +3218,8 @@ static int __net_init tcp_sk_init(struct net *net)
154 |  	else
155 |  		net->ipv4.tcp_congestion_control = &tcp_reno;
156 |  
157 | +	net->ipv4.sysctl_tcp_collapse_max_bytes = 0;
158 | +
159 |  	return 0;
160 |  fail:
161 |  	tcp_sk_exit(net);
162 | -- 
163 | 2.35.1
164 | 
165 | 


--------------------------------------------------------------------------------
/patches/0020-Add-a-sysctl-to-allow-TCP-window-shrinking-in-order-.patch:
--------------------------------------------------------------------------------
  1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
  2 | From: "mfreemon@cloudflare.com" <mfreemon@cloudflare.com>
  3 | Date: Wed, 1 Mar 2023 20:06:28 -0600
  4 | Subject: [PATCH] Add a sysctl to allow TCP window shrinking in order to honor
  5 |  memory limits
  6 | 
  7 | Under certain circumstances, the tcp receive buffer memory limit
  8 | set by autotuning is ignored, and the receive buffer can grow
  9 | unrestrained until it reaches tcp_rmem[2].
 10 | 
 11 | To reproduce:  Connect a TCP session with the receiver doing
 12 | nothing and the sender sending small packets (an infinite loop
 13 | of socket send() with 4 bytes of payload with a sleep of 1 ms
 14 | in between each send()).  This will fill the tcp receive buffer
 15 | all the way to tcp_rmem[2], ignoring the autotuning limit
 16 | (sk_rcvbuf).
 17 | 
 18 | As a result, a host can have individual tcp sessions with receive
 19 | buffers of size tcp_rmem[2], and the host itself can reach tcp_mem
 20 | limits, causing the host to go into tcp memory pressure mode.
 21 | 
 22 | The fundamental issue is the relationship between the granularity
 23 | of the window scaling factor and the number of bytes ACKed back
 24 | to the sender.  This problem has previously been identified in
 25 | RFC 7323, appendix F [1].
 26 | 
 27 | The Linux kernel currently adheres to never shrinking the window.
 28 | 
 29 | In addition to the overallocation of memory mentioned above, this
 30 | is also functionally incorrect, because once tcp_rmem[2] is
 31 | reached, the receiver will drop in-window packets resulting in
 32 | retransmissions and an eventual timeout of the tcp session.  A
 33 | receive buffer full condition should instead result in a zero
 34 | window and an indefinite wait.
 35 | 
 36 | In practice, this problem is largely hidden for most flows.  It
 37 | is not applicable to mice flows.  Elephant flows can send data
 38 | fast enough to "overrun" the sk_rcvbuf limit (in a single ACK),
 39 | triggering a zero window.
 40 | 
 41 | But this problem does show up for other types of flows.  Good
 42 | examples are websockets and other types of flows that send small
 43 | amounts of data spaced apart slightly in time.  In these cases,
 44 | we directly encounter the problem described in [1].
 45 | 
 46 | RFC 7323, section 2.4 [2], says there are instances when a retracted
 47 | window can be offered, and that TCP implementations MUST ensure
 48 | that they handle a shrinking window, as specified in RFC 1122,
 49 | section 4.2.2.16 [3].  All prior RFCs on the topic of tcp window
 50 | management have made clear that the sender must accept a shrunk window
 51 | from the receiver, including RFC 793 [4] and RFC 1323 [5].
 52 | 
 53 | This patch implements the functionality to shrink the tcp window
 54 | when necessary to keep the right edge within the memory limit set by
 55 | autotuning (sk_rcvbuf).  This new functionality is enabled with
 56 | the following sysctl:
 57 | 
 58 | sysctl: net.ipv4.tcp_shrink_window
 59 | 
 60 | This sysctl changes how the TCP window is calculated.
 61 | 
 62 | If sysctl tcp_shrink_window is zero (the default value), then the
 63 | window is never shrunk.
 64 | 
 65 | If sysctl tcp_shrink_window is non-zero, then the memory limit
 66 | set by autotuning is honored.  This requires that the TCP window
 67 | be shrunk ("retracted") as described in RFC 1122.
 68 | 
 69 | [1] https://www.rfc-editor.org/rfc/rfc7323#appendix-F
 70 | [2] https://www.rfc-editor.org/rfc/rfc7323#section-2.4
 71 | [3] https://www.rfc-editor.org/rfc/rfc1122#page-91
 72 | [4] https://www.rfc-editor.org/rfc/rfc793
 73 | [5] https://www.rfc-editor.org/rfc/rfc1323
 74 | ---
 75 |  Documentation/networking/ip-sysctl.rst | 14 ++++++
 76 |  include/net/netns/ipv4.h               |  2 +
 77 |  net/ipv4/sysctl_net_ipv4.c             |  7 +++
 78 |  net/ipv4/tcp_ipv4.c                    |  1 +
 79 |  net/ipv4/tcp_output.c                  | 59 +++++++++++++++++++-------
 80 |  5 files changed, 68 insertions(+), 15 deletions(-)
 81 | 
 82 | diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
 83 | index e7b3fa7bb3f7..114ea77f4786 100644
 84 | --- a/Documentation/networking/ip-sysctl.rst
 85 | +++ b/Documentation/networking/ip-sysctl.rst
 86 | @@ -965,6 +965,20 @@ tcp_tw_reuse - INTEGER
 87 |  tcp_window_scaling - BOOLEAN
 88 |  	Enable window scaling as defined in RFC1323.
 89 |  
 90 | +tcp_shrink_window - BOOLEAN
 91 | +	This changes how the TCP receive window is calculated when window
 92 | +	scaling is in effect.
 93 | +
 94 | +	RFC 7323, section 2.4, says there are instances when a retracted
 95 | +	window can be offered, and that TCP implementations MUST ensure
 96 | +	that they handle a shrinking window, as specified in RFC 1122.
 97 | +
 98 | +	- 0 - Disabled.	The window is never shrunk.
 99 | +	- 1 - Enabled.	The window is shrunk when necessary to remain within
100 | +					the memory limit set by autotuning (sk_rcvbuf).
101 | +
102 | +	Default: 0
103 | +
104 |  tcp_wmem - vector of 3 INTEGERs: min, default, max
105 |  	min: Amount of memory reserved for send buffers for TCP sockets.
106 |  	Each TCP socket has rights to use it due to fact of its birth.
107 | diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
108 | index bea45ca29cd0..476378afdd99 100644
109 | --- a/include/net/netns/ipv4.h
110 | +++ b/include/net/netns/ipv4.h
111 | @@ -231,5 +231,7 @@ struct netns_ipv4 {
112 |  
113 |  	atomic_t	rt_genid;
114 |  	siphash_key_t	ip_id_key;
115 | +
116 | +	unsigned int sysctl_tcp_shrink_window;
117 |  };
118 |  #endif
119 | diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
120 | index fab6da51e4c6..bf5386395ebd 100644
121 | --- a/net/ipv4/sysctl_net_ipv4.c
122 | +++ b/net/ipv4/sysctl_net_ipv4.c
123 | @@ -1398,6 +1398,13 @@ static struct ctl_table ipv4_net_table[] = {
124 |  		.mode		= 0644,
125 |  		.proc_handler	= proc_douintvec_minmax,
126 |  	},
127 | +	{
128 | +		.procname	= "tcp_shrink_window",
129 | +		.data		= &init_net.ipv4.sysctl_tcp_shrink_window,
130 | +		.maxlen		= sizeof(unsigned int),
131 | +		.mode		= 0644,
132 | +		.proc_handler	= proc_douintvec_minmax,
133 | +	},
134 |  	{ }
135 |  };
136 |  
137 | diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
138 | index a0a3880b8cf9..725c2aa3b515 100644
139 | --- a/net/ipv4/tcp_ipv4.c
140 | +++ b/net/ipv4/tcp_ipv4.c
141 | @@ -3217,6 +3217,7 @@ static int __net_init tcp_sk_init(struct net *net)
142 |  		net->ipv4.tcp_congestion_control = &tcp_reno;
143 |  
144 |  	net->ipv4.sysctl_tcp_collapse_max_bytes = 0;
145 | +	net->ipv4.sysctl_tcp_shrink_window = 0;
146 |  
147 |  	return 0;
148 |  }
149 | diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
150 | index 85f9a3a99bd6..c08cb445d5f7 100644
151 | --- a/net/ipv4/tcp_output.c
152 | +++ b/net/ipv4/tcp_output.c
153 | @@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk)
154 |  	u32 old_win = tp->rcv_wnd;
155 |  	u32 cur_win = tcp_receive_window(tp);
156 |  	u32 new_win = __tcp_select_window(sk);
157 | +	struct net *net = sock_net(sk);
158 |  
159 | -	/* Never shrink the offered window */
160 |  	if (new_win < cur_win) {
161 |  		/* Danger Will Robinson!
162 |  		 * Don't update rcv_wup/rcv_wnd here or else
163 | @@ -270,11 +270,15 @@ static u16 tcp_select_window(struct sock *sk)
164 |  		 *
165 |  		 * Relax Will Robinson.
166 |  		 */
167 | -		if (new_win == 0)
168 | -			NET_INC_STATS(sock_net(sk),
169 | -				      LINUX_MIB_TCPWANTZEROWINDOWADV);
170 | -		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
171 | +		if (!net->ipv4.sysctl_tcp_shrink_window) {
172 | +			/* Never shrink the offered window */
173 | +			if (new_win == 0)
174 | +				NET_INC_STATS(sock_net(sk),
175 | +					      LINUX_MIB_TCPWANTZEROWINDOWADV);
176 | +			new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
177 | +		}
178 |  	}
179 | +
180 |  	tp->rcv_wnd = new_win;
181 |  	tp->rcv_wup = tp->rcv_nxt;
182 |  
183 | @@ -2956,6 +2960,7 @@ u32 __tcp_select_window(struct sock *sk)
184 |  {
185 |  	struct inet_connection_sock *icsk = inet_csk(sk);
186 |  	struct tcp_sock *tp = tcp_sk(sk);
187 | +	struct net *net = sock_net(sk);
188 |  	/* MSS for the peer's data.  Previous versions used mss_clamp
189 |  	 * here.  I don't know if the value based on our guesses
190 |  	 * of peer's MSS is better for the performance.  It's more correct
191 | @@ -2977,16 +2982,24 @@ u32 __tcp_select_window(struct sock *sk)
192 |  		if (mss <= 0)
193 |  			return 0;
194 |  	}
195 | +
196 | +	if (net->ipv4.sysctl_tcp_shrink_window) {
197 | +		/* new window should always be an exact multiple of scaling factor */
198 | +		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
199 | +	}
200 | +
201 |  	if (free_space < (full_space >> 1)) {
202 |  		icsk->icsk_ack.quick = 0;
203 |  
204 |  		if (tcp_under_memory_pressure(sk))
205 |  			tcp_adjust_rcv_ssthresh(sk);
206 |  
207 | -		/* free_space might become our new window, make sure we don't
208 | -		 * increase it due to wscale.
209 | -		 */
210 | -		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
211 | +		if (!net->ipv4.sysctl_tcp_shrink_window) {
212 | +			/* free_space might become our new window, make sure we don't
213 | +			 * increase it due to wscale.
214 | +			 */
215 | +			free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
216 | +		}
217 |  
218 |  		/* if free space is less than mss estimate, or is below 1/16th
219 |  		 * of the maximum allowed, try to move to zero-window, else
220 | @@ -2997,10 +3010,24 @@ u32 __tcp_select_window(struct sock *sk)
221 |  		 */
222 |  		if (free_space < (allowed_space >> 4) || free_space < mss)
223 |  			return 0;
224 | +
225 | +		if (net->ipv4.sysctl_tcp_shrink_window && free_space < (1 << tp->rx_opt.rcv_wscale))
226 | +			return 0;
227 |  	}
228 |  
229 | -	if (free_space > tp->rcv_ssthresh)
230 | +	if (free_space > tp->rcv_ssthresh) {
231 |  		free_space = tp->rcv_ssthresh;
232 | +		if (net->ipv4.sysctl_tcp_shrink_window) {
233 | +			/* new window should always be an exact multiple of scaling factor
234 | +			 *
235 | +			 * For this case, we ALIGN "up" (increase free_space) because
236 | +			 * we know free_space is not zero here, it has been reduced from
237 | +			 * the memory-based limit, and rcv_ssthresh is not a hard limit
238 | +			 * (unlike sk_rcvbuf).
239 | +			 */
240 | +			free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
241 | +		}
242 | +	}
243 |  
244 |  	/* Don't do rounding if we are using window scaling, since the
245 |  	 * scaled window will not line up with the MSS boundary anyway.
246 | @@ -3008,11 +3035,13 @@ u32 __tcp_select_window(struct sock *sk)
247 |  	if (tp->rx_opt.rcv_wscale) {
248 |  		window = free_space;
249 |  
250 | -		/* Advertise enough space so that it won't get scaled away.
251 | -		 * Import case: prevent zero window announcement if
252 | -		 * 1<<rcv_wscale > mss.
253 | -		 */
254 | -		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
255 | +		if (!net->ipv4.sysctl_tcp_shrink_window) {
256 | +			/* Advertise enough space so that it won't get scaled away.
257 | +			 * Import case: prevent zero window announcement if
258 | +			 * 1<<rcv_wscale > mss.
259 | +			 */
260 | +			window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
261 | +		}
262 |  	} else {
263 |  		window = tp->rcv_wnd;
264 |  		/* Get the largest window that is a nice multiple of mss.
265 | -- 
266 | 2.39.2
267 | 
268 | 


--------------------------------------------------------------------------------
/patches/0024-Add-xtsproxy-Crypto-API-module.patch:
--------------------------------------------------------------------------------
  1 | From 354d7a8febaa440dd3244466670315ed2805764e Mon Sep 17 00:00:00 2001
  2 | From: Ignat Korchagin <ignat@cloudflare.com>
  3 | Date: Wed, 4 Dec 2019 16:53:46 +0000
  4 | Subject: [PATCH] Add xtsproxy Crypto API module
  5 | 
  6 | This module implements a synchronous AES-XTS driver for the Crypto API. It uses
  7 | the AES-NI implementation as a backend and falls back to the generic AES
  8 | implementation when the FPU is not usable.
  9 | ---
 10 |  crypto/Kconfig    |  10 ++++
 11 |  crypto/Makefile   |   1 +
 12 |  crypto/xtsproxy.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 13 |  3 files changed, 142 insertions(+)
 14 |  create mode 100644 crypto/xtsproxy.c
 15 | 
 16 | diff --git a/crypto/Kconfig b/crypto/Kconfig
 17 | index 094ef56ab7b4..9964667cef85 100644
 18 | --- a/crypto/Kconfig
 19 | +++ b/crypto/Kconfig
 20 | @@ -457,6 +457,16 @@ config CRYPTO_XTS
 21 |  	  key size 256, 384 or 512 bits. This implementation currently
 22 |  	  can't handle a sectorsize which is not a multiple of 16 bytes.
 23 |  
 24 | +config CRYPTO_XTS_AES_SYNC
 25 | +	tristate "XTS AES synchronous implementation"
 26 | +	depends on X86 && 64BIT
 27 | +	select CRYPTO_AES
 28 | +	select CRYPTO_AES_NI_INTEL
 29 | +	help
 30 | +	  A synchronous AES-XTS implementation, which uses AES NI as a
 31 | +	  backend implementation and falls back to generic implementation,
 32 | +	  when FPU is not usable.
 33 | +
 34 |  config CRYPTO_KEYWRAP
 35 |  	tristate "Key wrapping support"
 36 |  	select CRYPTO_SKCIPHER
 37 | diff --git a/crypto/Makefile b/crypto/Makefile
 38 | index b279483fba50..4f6ddcbdc6d4 100644
 39 | --- a/crypto/Makefile
 40 | +++ b/crypto/Makefile
 41 | @@ -90,6 +90,7 @@ obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o
 42 |  obj-$(CONFIG_CRYPTO_CTS) += cts.o
 43 |  obj-$(CONFIG_CRYPTO_LRW) += lrw.o
 44 |  obj-$(CONFIG_CRYPTO_XTS) += xts.o
 45 | +obj-$(CONFIG_CRYPTO_XTS_AES_SYNC) += xtsproxy.o
 46 |  obj-$(CONFIG_CRYPTO_CTR) += ctr.o
 47 |  obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o
 48 |  obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o
 49 | diff --git a/crypto/xtsproxy.c b/crypto/xtsproxy.c
 50 | new file mode 100644
 51 | index 000000000000..51ecfb7b4891
 52 | --- /dev/null
 53 | +++ b/crypto/xtsproxy.c
 54 | @@ -0,0 +1,131 @@
 55 | +#include <linux/module.h>
 56 | +#include <linux/crypto.h>
 57 | +#include <linux/err.h>
 58 | +#include <crypto/internal/skcipher.h>
 59 | +#include <crypto/aes.h>
 60 | +#include <asm/fpu/api.h>
 61 | +
 62 | +struct xtsproxy_ctx {
 63 | +	struct crypto_skcipher *xts_aesni;	/* "__xts-aes-aesni" tfm, chosen when irq_fpu_usable() */
 64 | +	struct crypto_skcipher *xts_generic;	/* "xts(ecb(aes-generic))" tfm, chosen otherwise */
 65 | +};
 66 | +
 67 | +/* Allocate both backend tfms for a new proxy tfm; returns 0 or a negative errno. */
 68 | +static int xtsproxy_skcipher_init(struct crypto_skcipher *tfm)
 69 | +{
 70 | +	struct xtsproxy_ctx *ctx = crypto_skcipher_ctx(tfm);
 71 | +
 72 | +	/* AESNI based XTS implementation, requires FPU to be available */
 73 | +	ctx->xts_aesni = crypto_alloc_skcipher("__xts-aes-aesni", CRYPTO_ALG_INTERNAL, 0);
 74 | +	if (IS_ERR(ctx->xts_aesni))
 75 | +		return PTR_ERR(ctx->xts_aesni);
 76 | +
 77 | +	/* generic FPU-less XTS; FPU checks are done here, so aes-aesni's own fallback is not needed */
 78 | +	ctx->xts_generic = crypto_alloc_skcipher("xts(ecb(aes-generic))", 0, 0);
 79 | +	if (IS_ERR(ctx->xts_generic)) {
 80 | +		/* .exit is not called after a failed .init, so release the AESNI tfm here */
 81 | +		crypto_free_skcipher(ctx->xts_aesni);
 82 | +		return PTR_ERR(ctx->xts_generic);
 83 | +	}
 84 | +
 85 | +	/* make sure we allocate enough request memory for both implementations */
 86 | +	crypto_skcipher_set_reqsize(tfm, max(crypto_skcipher_reqsize(ctx->xts_aesni), crypto_skcipher_reqsize(ctx->xts_generic)));
 87 | +	return 0;
 88 | +}
 89 | +
 90 | +static void xtsproxy_skcipher_exit(struct crypto_skcipher *tfm)
 91 | +{
 92 | +	struct xtsproxy_ctx *proxy = crypto_skcipher_ctx(tfm);
 93 | +	struct crypto_skcipher **slots[] = { &proxy->xts_generic, &proxy->xts_aesni };
 94 | +	size_t i;
 95 | +
 96 | +	/* walk the backends generic-first; slots holding NULL or an error are left alone */
 97 | +	for (i = 0; i < ARRAY_SIZE(slots); i++) {
 98 | +		if (IS_ERR_OR_NULL(*slots[i]))
 99 | +			continue;
100 | +		crypto_free_skcipher(*slots[i]);
101 | +		*slots[i] = NULL;
102 | +	}
103 | +}
104 | +
105 | +static int xtsproxy_setkey(struct crypto_skcipher *tfm, const u8 *key,
106 | +			    unsigned int keylen)
107 | +{
108 | +	struct xtsproxy_ctx *proxy = crypto_skcipher_ctx(tfm);
109 | +	int rc = crypto_skcipher_setkey(proxy->xts_aesni, key, keylen);
110 | +
111 | +	/* key the generic backend only once the AESNI backend accepted the key */
112 | +	if (!rc)
113 | +		rc = crypto_skcipher_setkey(proxy->xts_generic, key, keylen);
114 | +
115 | +	return rc;
116 | +}
117 | +
118 | +/* Retarget @req at the AESNI tfm when irq_fpu_usable(), else at the generic tfm. */
119 | +static void xtsproxy_select_backend(struct skcipher_request *req)
120 | +{
121 | +	struct xtsproxy_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
122 | +
123 | +	if (irq_fpu_usable())
124 | +		skcipher_request_set_tfm(req, ctx->xts_aesni);
125 | +	else
126 | +		skcipher_request_set_tfm(req, ctx->xts_generic);
127 | +
128 | +	/* underlying implementations should not try to sleep */
129 | +	req->base.flags &= ~(CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG);
130 | +}
131 | +
132 | +/* Encrypt @req through the selected backend; returns that backend's result. */
133 | +static int xtsproxy_encrypt(struct skcipher_request *req)
134 | +{
135 | +	xtsproxy_select_backend(req);
136 | +
137 | +	return crypto_skcipher_encrypt(req);
138 | +}
139 | +
140 | +/* Decrypt @req through the selected backend; returns that backend's result. */
141 | +static int xtsproxy_decrypt(struct skcipher_request *req)
142 | +{
143 | +	xtsproxy_select_backend(req);
144 | +
145 | +	return crypto_skcipher_decrypt(req);
146 | +}
147 | +
148 | +static struct skcipher_alg xtsproxy_skcipher = {
149 | +	.base = {
150 | +		.cra_name			= "xts(aes)",
151 | +		.cra_driver_name	= "xts-aes-xtsproxy",
152 | +		/* priority 0: make sure we don't use it unless requested explicitly */
153 | +		.cra_priority		= 0,
154 | +		/* NOTE(review): CRYPTO_ALG_INTERNAL is left unset (line was commented out) - confirm intent */
155 | +		.cra_blocksize		= AES_BLOCK_SIZE,
156 | +		.cra_ctxsize		= sizeof(struct xtsproxy_ctx),
157 | +		.cra_module			= THIS_MODULE,
158 | +	},
159 | +	.min_keysize	= 2 * AES_MIN_KEY_SIZE,	/* an XTS key is two AES keys back to back */
160 | +	.max_keysize	= 2 * AES_MAX_KEY_SIZE,	/* same doubling for the upper bound */
161 | +	.ivsize			= AES_BLOCK_SIZE,
162 | +	.init 			= xtsproxy_skcipher_init,
163 | +	.exit 			= xtsproxy_skcipher_exit,
164 | +	.setkey			= xtsproxy_setkey,
165 | +	.encrypt		= xtsproxy_encrypt,
166 | +	.decrypt		= xtsproxy_decrypt,
167 | +};
168 | +
169 | +static int __init xtsproxy_init(void)
170 | +{
171 | +	return crypto_register_skcipher(&xtsproxy_skcipher); /* module load: register the proxy alg */
172 | +}
173 | +
174 | +static void __exit xtsproxy_fini(void)
175 | +{
176 | +	crypto_unregister_skcipher(&xtsproxy_skcipher); /* module unload: drop the registration */
177 | +}
178 | +
179 | +module_init(xtsproxy_init);
180 | +module_exit(xtsproxy_fini);
181 | +
182 | +MODULE_DESCRIPTION("XTS-AES using AESNI implementation with generic AES fallback");
183 | +MODULE_AUTHOR("Ignat Korchagin <ignat@cloudflare.com>");
184 | +MODULE_LICENSE("GPL");
185 | +MODULE_ALIAS_CRYPTO("xts(aes)");
186 | -- 
187 | 2.29.1
188 | 
189 | 


--------------------------------------------------------------------------------