├── .github └── workflows │ └── semgrep.yml ├── COPYING ├── README.md └── patches ├── 0001-audit-check-syscall-bitmap-on-entry-to-avoid-extra-w.patch ├── 0014-add-a-sysctl-to-enable-disable-tcp_collapse-logic.patch ├── 0020-Add-a-sysctl-to-allow-TCP-window-shrinking-in-order-.patch └── 0024-Add-xtsproxy-Crypto-API-module.patch /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | schedule: 10 | - cron: '0 0 * * *' 11 | name: Semgrep config 12 | jobs: 13 | semgrep: 14 | name: semgrep/ci 15 | runs-on: ubuntu-20.04 16 | env: 17 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 18 | SEMGREP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 20 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 21 | container: 22 | image: returntocorp/semgrep 23 | steps: 24 | - uses: actions/checkout@v3 25 | - run: semgrep ci 26 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | This software is distributed under the same license as The Linux Kernel. 2 | See https://www.kernel.org/pub/linux/kernel/COPYING for details. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cloudflare Linux Kernel Patches 2 | 3 | This repository contains some Linux Kernel patches, which were not submitted to the mainline kernel tree. These patches may not be generic enough for inclusion into the mainline kernel tree, may have been declined by the kernel maintainers, etc.
4 | 5 | ## Supported Kernel Versions 6 | 7 | Normally the patches in this repository should be applicable on top of the latest [Linux Kernel long-term support ("LTS") release](https://www.kernel.org/releases.html). Other versions are not considered. 8 | 9 | These patches are provided as-is without any support guarantee, but contributions are welcome. 10 | -------------------------------------------------------------------------------- /patches/0001-audit-check-syscall-bitmap-on-entry-to-avoid-extra-w.patch: -------------------------------------------------------------------------------- 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 2 | From: Ivan Babrou 3 | Date: Mon, 22 May 2023 17:04:03 -0700 4 | Subject: [PATCH] audit: check syscall bitmap on entry to avoid extra work 5 | 6 | Currently audit subsystem arms itself as long as there are rules present, 7 | which means that on every syscall exit all rules are evaluated, even 8 | if they don't match the syscall to begin with. For setups where 9 | there are no rules that can match any syscall, this means that 10 | the CPU price needs to be paid when it's not necessary. 11 | 12 | This patch introduces a bitmap for syscalls that is maintained 13 | when rules are inserted and removed. For every syscall we maintain 14 | a bit indicating whether it needs to be audited at all, which is then 15 | checked at syscall entry. If there are no rules matching a syscall, 16 | extra cost of checking all the rules is avoided.
17 | 18 | Consider the following set of 10 audit rules as a benchmark: 19 | 20 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/0 -F key=BENCH0 21 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/1 -F key=BENCH1 22 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/2 -F key=BENCH2 23 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/3 -F key=BENCH3 24 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/4 -F key=BENCH4 25 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/5 -F key=BENCH5 26 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/6 -F key=BENCH6 27 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/7 -F key=BENCH7 28 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/8 -F key=BENCH8 29 | -a always,exit -F arch=b64 -S unlinkat,linkat,renameat,openat,renameat2 -F perm=wa -F dir=/tmp/audit-bench/9 -F key=BENCH9 30 | 31 | We can use the following benchmark to run unrelated syscalls: 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | #define GETPID_COUNT 100 * 1000 38 | #define STAT_COUNT 100 * 1000 39 | 40 | pid_t bench_getpid() 41 | { 42 | pid_t pid; 43 | 44 | for (int i = 0; i < GETPID_COUNT; i++) 45 | { 46 | pid = getpid(); 47 | } 48 | 49 | return pid; 50 | } 51 | 52 | struct stat bench_stat() 53 | { 54 | struct stat statbuf; 55 | 56 | for (int i = 0; i < STAT_COUNT; i++) 57 | { 58 | stat("/etc/passwd", &statbuf); 59 | } 60 | 61 | return statbuf; 62 | } 63 | 64 | int main() 65 | { 66 | pid_t pid = bench_getpid(); 67 | struct stat 
statbuf = bench_stat(); 68 | 69 | printf("pid = %d, size = %d\n", pid, statbuf.st_size); 70 | } 71 | 72 | Here we run 100k `getpid()` calls and 100k `stat()` calls, which are not 73 | covered by any of the audit rules installed on the system. 74 | 75 | When running without any rules present, but with auditd running, flamegraphs 76 | show ~5% of CPU time spent in audit_* code. If we install the rules mentioned 77 | above, this number jumps to ~24%. With this patch applied, the number is once 78 | again down to 5%, which is what one would expect. 79 | 80 | There's extra cost of maintaining the bitmap when rules are changed, 81 | but it's negligible compared to CPU savings from cheaper syscalls. 82 | 83 | Signed-off-by: Ivan Babrou 84 | 85 | --- 86 | v2: Skipping AUDIT_NEVER in bitmap. 87 | --- 88 | include/linux/audit.h | 21 +++++++++++++++++++++ 89 | kernel/auditfilter.c | 36 ++++++++++++++++++++++++++++++++---- 90 | kernel/auditsc.c | 27 +++++++++++---------------- 91 | 3 files changed, 64 insertions(+), 20 deletions(-) 92 | 93 | diff --git a/include/linux/audit.h b/include/linux/audit.h 94 | index 31086a72e32a..e99428052321 100644 95 | --- a/include/linux/audit.h 96 | +++ b/include/linux/audit.h 97 | @@ -9,6 +9,7 @@ 98 | #ifndef _LINUX_AUDIT_H_ 99 | #define _LINUX_AUDIT_H_ 100 | 101 | +#include 102 | #include 103 | #include 104 | #include 105 | @@ -399,6 +400,22 @@ static inline void audit_ptrace(struct task_struct *t) 106 | __audit_ptrace(t); 107 | } 108 | 109 | +static inline int audit_in_mask(const struct audit_krule *rule, unsigned long val) 110 | +{ 111 | + int word, bit; 112 | + 113 | + if (val > 0xffffffff) 114 | + return false; 115 | + 116 | + word = AUDIT_WORD(val); 117 | + if (word >= AUDIT_BITMASK_SIZE) 118 | + return false; 119 | + 120 | + bit = AUDIT_BIT(val); 121 | + 122 | + return rule->mask[word] & bit; 123 | +} 124 | + 125 | /* Private API (for audit.c only) */ 126 | extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); 127 | extern void 
__audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); 128 | @@ -573,6 +590,10 @@ static inline void audit_log_nfcfg(const char *name, u8 af, 129 | 130 | extern int audit_n_rules; 131 | extern int audit_signals; 132 | + 133 | +extern int audit_n_syscall_rules; 134 | +extern int audit_syscall_rules[NR_syscalls]; 135 | +extern DECLARE_BITMAP(audit_syscalls_bitmap, NR_syscalls); 136 | #else /* CONFIG_AUDITSYSCALL */ 137 | static inline int audit_alloc(struct task_struct *task) 138 | { 139 | diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c 140 | index 42d99896e7a6..e6ca85e96622 100644 141 | --- a/kernel/auditfilter.c 142 | +++ b/kernel/auditfilter.c 143 | @@ -943,7 +943,7 @@ static inline int audit_add_rule(struct audit_entry *entry) 144 | struct list_head *list; 145 | int err = 0; 146 | #ifdef CONFIG_AUDITSYSCALL 147 | - int dont_count = 0; 148 | + int syscall_nr, dont_count = 0; 149 | 150 | /* If any of these, don't count towards total */ 151 | switch(entry->rule.listnr) { 152 | @@ -1007,9 +1007,23 @@ static inline int audit_add_rule(struct audit_entry *entry) 153 | list_add_tail_rcu(&entry->list, list); 154 | } 155 | #ifdef CONFIG_AUDITSYSCALL 156 | - if (!dont_count) 157 | + if (!dont_count) { 158 | audit_n_rules++; 159 | 160 | + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { 161 | + audit_n_syscall_rules++; 162 | + 163 | + if (entry->rule.action != AUDIT_NEVER) { 164 | + for (syscall_nr = 0; syscall_nr < NR_syscalls; syscall_nr++) { 165 | + if (!audit_in_mask(&entry->rule, syscall_nr)) 166 | + continue; 167 | + if (++audit_syscall_rules[syscall_nr] == 1) 168 | + set_bit(syscall_nr, audit_syscalls_bitmap); 169 | + } 170 | + } 171 | + } 172 | + } 173 | + 174 | if (!audit_match_signal(entry)) 175 | audit_signals++; 176 | #endif 177 | @@ -1026,7 +1040,7 @@ int audit_del_rule(struct audit_entry *entry) 178 | struct list_head *list; 179 | int ret = 0; 180 | #ifdef CONFIG_AUDITSYSCALL 181 | - int dont_count = 0; 182 | + int syscall_nr, 
dont_count = 0; 183 | 184 | /* If any of these, don't count towards total */ 185 | switch(entry->rule.listnr) { 186 | @@ -1054,9 +1068,23 @@ int audit_del_rule(struct audit_entry *entry) 187 | audit_remove_mark_rule(&e->rule); 188 | 189 | #ifdef CONFIG_AUDITSYSCALL 190 | - if (!dont_count) 191 | + if (!dont_count) { 192 | audit_n_rules--; 193 | 194 | + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { 195 | + audit_n_syscall_rules--; 196 | + 197 | + if (entry->rule.action != AUDIT_NEVER) { 198 | + for (syscall_nr = 0; syscall_nr < NR_syscalls; syscall_nr++) { 199 | + if (!audit_in_mask(&entry->rule, syscall_nr)) 200 | + continue; 201 | + if (--audit_syscall_rules[syscall_nr] == 0) 202 | + clear_bit(syscall_nr, audit_syscalls_bitmap); 203 | + } 204 | + } 205 | + } 206 | + } 207 | + 208 | if (!audit_match_signal(entry)) 209 | audit_signals--; 210 | #endif 211 | diff --git a/kernel/auditsc.c b/kernel/auditsc.c 212 | index addeed3df15d..eb8296474bb2 100644 213 | --- a/kernel/auditsc.c 214 | +++ b/kernel/auditsc.c 215 | @@ -86,6 +86,15 @@ int audit_n_rules; 216 | /* determines whether we collect data for signals sent */ 217 | int audit_signals; 218 | 219 | +/* number of syscall related audit rules */ 220 | +int audit_n_syscall_rules; 221 | + 222 | +/* number of rules per syscall */ 223 | +int audit_syscall_rules[NR_syscalls]; 224 | + 225 | +/* bitmap for checking whether a syscall is audited */ 226 | +DECLARE_BITMAP(audit_syscalls_bitmap, NR_syscalls); 227 | + 228 | struct audit_aux_data { 229 | struct audit_aux_data *next; 230 | int type; 231 | @@ -790,22 +799,6 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) 232 | return AUDIT_STATE_BUILD; 233 | } 234 | 235 | -static int audit_in_mask(const struct audit_krule *rule, unsigned long val) 236 | -{ 237 | - int word, bit; 238 | - 239 | - if (val > 0xffffffff) 240 | - return false; 241 | - 242 | - word = AUDIT_WORD(val); 243 | - if (word >= AUDIT_BITMASK_SIZE) 244 | - return false; 245 | - 246 
| - bit = AUDIT_BIT(val); 247 | - 248 | - return rule->mask[word] & bit; 249 | -} 250 | - 251 | /** 252 | * __audit_filter_op - common filter helper for operations (syscall/uring/etc) 253 | * @tsk: associated task 254 | @@ -2025,6 +2018,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, 255 | return; 256 | 257 | context->dummy = !audit_n_rules; 258 | + if (!context->dummy && audit_n_syscall_rules == audit_n_rules) 259 | + context->dummy = !test_bit(major, audit_syscalls_bitmap); 260 | if (!context->dummy && state == AUDIT_STATE_BUILD) { 261 | context->prio = 0; 262 | if (auditd_test_task(current)) 263 | -- 264 | 2.41.0 265 | 266 | -------------------------------------------------------------------------------- /patches/0014-add-a-sysctl-to-enable-disable-tcp_collapse-logic.patch: -------------------------------------------------------------------------------- 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 2 | From: "mfreemon@cloudflare.com" 3 | Date: Tue, 1 Mar 2022 17:06:02 -0600 4 | Subject: [PATCH] Add a sysctl to skip tcp collapse processing when the receive 5 | buffer is full. 6 | 7 | For context and additional information about this patch, see the 8 | blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/ 9 | 10 | sysctl: net.ipv4.tcp_collapse_max_bytes 11 | 12 | If tcp_collapse_max_bytes is non-zero, attempt to collapse the 13 | queue to free up memory if the current amount of memory allocated 14 | is less than tcp_collapse_max_bytes. Otherwise, the packet is 15 | dropped without attempting to collapse the queue. 16 | 17 | If tcp_collapse_max_bytes is zero, this feature is disabled 18 | and the default Linux behavior is used. The default Linux 19 | behavior is to always perform the attempt to collapse the 20 | queue to free up memory. 21 | 22 | When the receive queue is small, we want to collapse the 23 | queue. 
There are two reasons for this: (a) the latency of 24 | performing the collapse will be small on a small queue, and 25 | (b) we want to avoid sending a congestion signal (via a 26 | packet drop) to the sender when the receive queue is small. 27 | 28 | The result is that we avoid latency spikes caused by the 29 | time it takes to perform the collapse logic when the receive 30 | queue is large and full, while preserving existing behavior 31 | and performance for all other cases. 32 | --- 33 | include/net/netns/ipv4.h | 1 + 34 | include/trace/events/tcp.h | 7 +++++++ 35 | net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 36 | net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++ 37 | net/ipv4/tcp_ipv4.c | 2 ++ 38 | 5 files changed, 53 insertions(+) 39 | 40 | diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h 41 | index 6c5b2efc4f17..bf2c9b5847e4 100644 42 | --- a/include/net/netns/ipv4.h 43 | +++ b/include/net/netns/ipv4.h 44 | @@ -182,6 +182,7 @@ struct netns_ipv4 { 45 | int sysctl_udp_rmem_min; 46 | 47 | u8 sysctl_fib_notify_on_flag_change; 48 | + unsigned int sysctl_tcp_collapse_max_bytes; 49 | 50 | #ifdef CONFIG_NET_L3_MASTER_DEV 51 | u8 sysctl_udp_l3mdev_accept; 52 | diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h 53 | index 521059d8dc0a..35fa0f23680c 100644 54 | --- a/include/trace/events/tcp.h 55 | +++ b/include/trace/events/tcp.h 56 | @@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, 57 | TP_ARGS(sk) 58 | ); 59 | 60 | +DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, 61 | + 62 | + TP_PROTO(struct sock *sk), 63 | + 64 | + TP_ARGS(sk) 65 | +); 66 | + 67 | TRACE_EVENT(tcp_retransmit_synack, 68 | 69 | TP_PROTO(const struct sock *sk, const struct request_sock *req), 70 | diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c 71 | index 6f1e64d49232..a61e6b1ac0cc 100644 72 | --- a/net/ipv4/sysctl_net_ipv4.c 73 | +++ b/net/ipv4/sysctl_net_ipv4.c 74 | @@ -1406,6 +1406,13 @@ static struct 
ctl_table ipv4_net_table[] = { 75 | .extra1 = SYSCTL_ZERO, 76 | .extra2 = &two, 77 | }, 78 | + { 79 | + .procname = "tcp_collapse_max_bytes", 80 | + .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, 81 | + .maxlen = sizeof(unsigned int), 82 | + .mode = 0644, 83 | + .proc_handler = proc_douintvec_minmax, 84 | + }, 85 | { } 86 | }; 87 | 88 | diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c 89 | index f3b623967436..204f33f2835c 100644 90 | --- a/net/ipv4/tcp_input.c 91 | +++ b/net/ipv4/tcp_input.c 92 | @@ -5340,6 +5340,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) 93 | static int tcp_prune_queue(struct sock *sk) 94 | { 95 | struct tcp_sock *tp = tcp_sk(sk); 96 | + struct net *net = sock_net(sk); 97 | 98 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); 99 | 100 | @@ -5351,6 +5352,39 @@ static int tcp_prune_queue(struct sock *sk) 101 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) 102 | return 0; 103 | 104 | + /* For context and additional information about this patch, see the 105 | + * blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/ 106 | + * 107 | + * sysctl: net.ipv4.tcp_collapse_max_bytes 108 | + * 109 | + * If tcp_collapse_max_bytes is non-zero, attempt to collapse the 110 | + * queue to free up memory if the current amount of memory allocated 111 | + * is less than tcp_collapse_max_bytes. Otherwise, the packet is 112 | + * dropped without attempting to collapse the queue. 113 | + * 114 | + * If tcp_collapse_max_bytes is zero, this feature is disabled 115 | + * and the default Linux behavior is used. The default Linux 116 | + * behavior is to always perform the attempt to collapse the 117 | + * queue to free up memory. 118 | + * 119 | + * When the receive queue is small, we want to collapse the 120 | + * queue. There are two reasons for this: (a) the latency of 121 | + * performing the collapse will be small on a small queue, and 122 | + * (b) we want to avoid sending a congestion signal (via a 123 | + * packet drop) to the sender when the receive queue is small.
124 | + * 125 | + * The result is that we avoid latency spikes caused by the 126 | + * time it takes to perform the collapse logic when the receive 127 | + * queue is large and full, while preserving existing behavior 128 | + * and performance for all other cases. 129 | + */ 130 | + if (net->ipv4.sysctl_tcp_collapse_max_bytes && 131 | + (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { 132 | + /* We are dropping the packet */ 133 | + trace_tcp_collapse_max_bytes_exceeded(sk); 134 | + goto do_not_collapse; 135 | + } 136 | + 137 | tcp_collapse_ofo_queue(sk); 138 | if (!skb_queue_empty(&sk->sk_receive_queue)) 139 | tcp_collapse(sk, &sk->sk_receive_queue, NULL, 140 | @@ -5370,6 +5404,8 @@ static int tcp_prune_queue(struct sock *sk) 141 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) 142 | return 0; 143 | 144 | +do_not_collapse: 145 | + 146 | /* If we are really being abused, tell the caller to silently 147 | * drop receive data on the floor. It will get retransmitted 148 | * and hopefully then we'll have sufficient space. 
149 | diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c 150 | index 0fe9461647da..4fadbf38525f 100644 151 | --- a/net/ipv4/tcp_ipv4.c 152 | +++ b/net/ipv4/tcp_ipv4.c 153 | @@ -3218,6 +3218,8 @@ static int __net_init tcp_sk_init(struct net *net) 154 | else 155 | net->ipv4.tcp_congestion_control = &tcp_reno; 156 | 157 | + net->ipv4.sysctl_tcp_collapse_max_bytes = 0; 158 | + 159 | return 0; 160 | fail: 161 | tcp_sk_exit(net); 162 | -- 163 | 2.35.1 164 | 165 | -------------------------------------------------------------------------------- /patches/0020-Add-a-sysctl-to-allow-TCP-window-shrinking-in-order-.patch: -------------------------------------------------------------------------------- 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 2 | From: "mfreemon@cloudflare.com" 3 | Date: Wed, 1 Mar 2023 20:06:28 -0600 4 | Subject: [PATCH] Add a sysctl to allow TCP window shrinking in order to honor 5 | memory limits 6 | 7 | Under certain circumstances, the tcp receive buffer memory limit 8 | set by autotuning is ignored, and the receive buffer can grow 9 | unrestrained until it reaches tcp_rmem[2]. 10 | 11 | To reproduce: Connect a TCP session with the receiver doing 12 | nothing and the sender sending small packets (an infinite loop 13 | of socket send() with 4 bytes of payload with a sleep of 1 ms 14 | in between each send()). This will fill the tcp receive buffer 15 | all the way to tcp_rmem[2], ignoring the autotuning limit 16 | (sk_rcvbuf). 17 | 18 | As a result, a host can have individual tcp sessions with receive 19 | buffers of size tcp_rmem[2], and the host itself can reach tcp_mem 20 | limits, causing the host to go into tcp memory pressure mode. 21 | 22 | The fundamental issue is the relationship between the granularity 23 | of the window scaling factor and the number of byte ACKed back 24 | to the sender. This problem has previously been identified in 25 | RFC 7323, appendix F [1]. 
26 | 27 | The Linux kernel currently adheres to never shrinking the window. 28 | 29 | In addition to the overallocation of memory mentioned above, this 30 | is also functionally incorrect, because once tcp_rmem[2] is 31 | reached, the receiver will drop in-window packets resulting in 32 | retransmissions and an eventual timeout of the tcp session. A 33 | receive buffer full condition should instead result in a zero 34 | window and an indefinite wait. 35 | 36 | In practice, this problem is largely hidden for most flows. It 37 | is not applicable to mice flows. Elephant flows can send data 38 | fast enough to "overrun" the sk_rcvbuf limit (in a single ACK), 39 | triggering a zero window. 40 | 41 | But this problem does show up for other types of flows. Good 42 | examples are websockets and other types of flows that send small 43 | amounts of data spaced apart slightly in time. In these cases, 44 | we directly encounter the problem described in [1]. 45 | 46 | RFC 7323, section 2.4 [2], says there are instances when a retracted 47 | window can be offered, and that TCP implementations MUST ensure 48 | that they handle a shrinking window, as specified in RFC 1122, 49 | section 4.2.2.16 [3]. All prior RFCs on the topic of tcp window 50 | management have made clear that the sender must accept a shrunk window 51 | from the receiver, including RFC 793 [4] and RFC 1323 [5]. 52 | 53 | This patch implements the functionality to shrink the tcp window 54 | when necessary to keep the right edge within the memory limit set by 55 | autotuning (sk_rcvbuf). This new functionality is enabled with 56 | the following sysctl: 57 | 58 | sysctl: net.ipv4.tcp_shrink_window 59 | 60 | This sysctl changes how the TCP window is calculated. 61 | 62 | If sysctl tcp_shrink_window is zero (the default value), then the 63 | window is never shrunk. 64 | 65 | If sysctl tcp_shrink_window is non-zero, then the memory limit 66 | set by autotuning is honored.
This requires that the TCP window 67 | be shrunk ("retracted") as described in RFC 1122. 68 | 69 | [1] https://www.rfc-editor.org/rfc/rfc7323#appendix-F 70 | [2] https://www.rfc-editor.org/rfc/rfc7323#section-2.4 71 | [3] https://www.rfc-editor.org/rfc/rfc1122#page-91 72 | [4] https://www.rfc-editor.org/rfc/rfc793 73 | [5] https://www.rfc-editor.org/rfc/rfc1323 74 | --- 75 | Documentation/networking/ip-sysctl.rst | 14 ++++++ 76 | include/net/netns/ipv4.h | 2 + 77 | net/ipv4/sysctl_net_ipv4.c | 7 +++ 78 | net/ipv4/tcp_ipv4.c | 1 + 79 | net/ipv4/tcp_output.c | 59 +++++++++++++++++++------- 80 | 5 files changed, 68 insertions(+), 15 deletions(-) 81 | 82 | diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst 83 | index e7b3fa7bb3f7..114ea77f4786 100644 84 | --- a/Documentation/networking/ip-sysctl.rst 85 | +++ b/Documentation/networking/ip-sysctl.rst 86 | @@ -965,6 +965,20 @@ tcp_tw_reuse - INTEGER 87 | tcp_window_scaling - BOOLEAN 88 | Enable window scaling as defined in RFC1323. 89 | 90 | +tcp_shrink_window - BOOLEAN 91 | + This changes how the TCP receive window is calculated when window 92 | + scaling is in effect. 93 | + 94 | + RFC 7323, section 2.4, says there are instances when a retracted 95 | + window can be offered, and that TCP implementations MUST ensure 96 | + that they handle a shrinking window, as specified in RFC 1122. 97 | + 98 | + - 0 - Disabled. The window is never shrunk. 99 | + - 1 - Enabled. The window is shrunk when necessary to remain within 100 | + the memory limit set by autotuning (sk_rcvbuf). 101 | + 102 | + Default: 0 103 | + 104 | tcp_wmem - vector of 3 INTEGERs: min, default, max 105 | min: Amount of memory reserved for send buffers for TCP sockets. 106 | Each TCP socket has rights to use it due to fact of its birth. 
107 | diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h 108 | index bea45ca29cd0..476378afdd99 100644 109 | --- a/include/net/netns/ipv4.h 110 | +++ b/include/net/netns/ipv4.h 111 | @@ -231,5 +231,7 @@ struct netns_ipv4 { 112 | 113 | atomic_t rt_genid; 114 | siphash_key_t ip_id_key; 115 | + 116 | + unsigned int sysctl_tcp_shrink_window; 117 | }; 118 | #endif 119 | diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c 120 | index fab6da51e4c6..bf5386395ebd 100644 121 | --- a/net/ipv4/sysctl_net_ipv4.c 122 | +++ b/net/ipv4/sysctl_net_ipv4.c 123 | @@ -1398,6 +1398,13 @@ static struct ctl_table ipv4_net_table[] = { 124 | .mode = 0644, 125 | .proc_handler = proc_douintvec_minmax, 126 | }, 127 | + { 128 | + .procname = "tcp_shrink_window", 129 | + .data = &init_net.ipv4.sysctl_tcp_shrink_window, 130 | + .maxlen = sizeof(unsigned int), 131 | + .mode = 0644, 132 | + .proc_handler = proc_douintvec_minmax, 133 | + }, 134 | { } 135 | }; 136 | 137 | diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c 138 | index a0a3880b8cf9..725c2aa3b515 100644 139 | --- a/net/ipv4/tcp_ipv4.c 140 | +++ b/net/ipv4/tcp_ipv4.c 141 | @@ -3217,6 +3217,7 @@ static int __net_init tcp_sk_init(struct net *net) 142 | net->ipv4.tcp_congestion_control = &tcp_reno; 143 | 144 | net->ipv4.sysctl_tcp_collapse_max_bytes = 0; 145 | + net->ipv4.sysctl_tcp_shrink_window = 0; 146 | 147 | return 0; 148 | } 149 | diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c 150 | index 85f9a3a99bd6..c08cb445d5f7 100644 151 | --- a/net/ipv4/tcp_output.c 152 | +++ b/net/ipv4/tcp_output.c 153 | @@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk) 154 | u32 old_win = tp->rcv_wnd; 155 | u32 cur_win = tcp_receive_window(tp); 156 | u32 new_win = __tcp_select_window(sk); 157 | + struct net *net = sock_net(sk); 158 | 159 | - /* Never shrink the offered window */ 160 | if (new_win < cur_win) { 161 | /* Danger Will Robinson! 
162 | * Don't update rcv_wup/rcv_wnd here or else 163 | @@ -270,11 +270,15 @@ static u16 tcp_select_window(struct sock *sk) 164 | * 165 | * Relax Will Robinson. 166 | */ 167 | - if (new_win == 0) 168 | - NET_INC_STATS(sock_net(sk), 169 | - LINUX_MIB_TCPWANTZEROWINDOWADV); 170 | - new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); 171 | + if (!net->ipv4.sysctl_tcp_shrink_window) { 172 | + /* Never shrink the offered window */ 173 | + if (new_win == 0) 174 | + NET_INC_STATS(sock_net(sk), 175 | + LINUX_MIB_TCPWANTZEROWINDOWADV); 176 | + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); 177 | + } 178 | } 179 | + 180 | tp->rcv_wnd = new_win; 181 | tp->rcv_wup = tp->rcv_nxt; 182 | 183 | @@ -2956,6 +2960,7 @@ u32 __tcp_select_window(struct sock *sk) 184 | { 185 | struct inet_connection_sock *icsk = inet_csk(sk); 186 | struct tcp_sock *tp = tcp_sk(sk); 187 | + struct net *net = sock_net(sk); 188 | /* MSS for the peer's data. Previous versions used mss_clamp 189 | * here. I don't know if the value based on our guesses 190 | * of peer's MSS is better for the performance. It's more correct 191 | @@ -2977,16 +2982,24 @@ u32 __tcp_select_window(struct sock *sk) 192 | if (mss <= 0) 193 | return 0; 194 | } 195 | + 196 | + if (net->ipv4.sysctl_tcp_shrink_window) { 197 | + /* new window should always be an exact multiple of scaling factor */ 198 | + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); 199 | + } 200 | + 201 | if (free_space < (full_space >> 1)) { 202 | icsk->icsk_ack.quick = 0; 203 | 204 | if (tcp_under_memory_pressure(sk)) 205 | tcp_adjust_rcv_ssthresh(sk); 206 | 207 | - /* free_space might become our new window, make sure we don't 208 | - * increase it due to wscale. 209 | - */ 210 | - free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); 211 | + if (!net->ipv4.sysctl_tcp_shrink_window) { 212 | + /* free_space might become our new window, make sure we don't 213 | + * increase it due to wscale. 
214 | + */ 215 | + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); 216 | + } 217 | 218 | /* if free space is less than mss estimate, or is below 1/16th 219 | * of the maximum allowed, try to move to zero-window, else 220 | @@ -2997,10 +3010,24 @@ u32 __tcp_select_window(struct sock *sk) 221 | */ 222 | if (free_space < (allowed_space >> 4) || free_space < mss) 223 | return 0; 224 | + 225 | + if (net->ipv4.sysctl_tcp_shrink_window && free_space < (1 << tp->rx_opt.rcv_wscale)) 226 | + return 0; 227 | } 228 | 229 | - if (free_space > tp->rcv_ssthresh) 230 | + if (free_space > tp->rcv_ssthresh) { 231 | free_space = tp->rcv_ssthresh; 232 | + if (net->ipv4.sysctl_tcp_shrink_window) { 233 | + /* new window should always be an exact multiple of scaling factor 234 | + * 235 | + * For this case, we ALIGN "up" (increase free_space) because 236 | + * we know free_space is not zero here, it has been reduced from 237 | + * the memory-based limit, and rcv_ssthresh is not a hard limit 238 | + * (unlike sk_rcvbuf). 239 | + */ 240 | + free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); 241 | + } 242 | + } 243 | 244 | /* Don't do rounding if we are using window scaling, since the 245 | * scaled window will not line up with the MSS boundary anyway. 246 | @@ -3008,11 +3035,13 @@ u32 __tcp_select_window(struct sock *sk) 247 | if (tp->rx_opt.rcv_wscale) { 248 | window = free_space; 249 | 250 | - /* Advertise enough space so that it won't get scaled away. 251 | - * Import case: prevent zero window announcement if 252 | - * 1<<rcv_wscale > mss. 253 | - */ 254 | - window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); 255 | + if (!net->ipv4.sysctl_tcp_shrink_window) { 256 | + /* Advertise enough space so that it won't get scaled away. 257 | + * Import case: prevent zero window announcement if 258 | + * 1<<rcv_wscale > mss.
259 | + */ 260 | + window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); 261 | + } 262 | } else { 263 | window = tp->rcv_wnd; 264 | /* Get the largest window that is a nice multiple of mss. 265 | -- 266 | 2.39.2 267 | 268 | -------------------------------------------------------------------------------- /patches/0024-Add-xtsproxy-Crypto-API-module.patch: -------------------------------------------------------------------------------- 1 | From 354d7a8febaa440dd3244466670315ed2805764e Mon Sep 17 00:00:00 2001 2 | From: Ignat Korchagin 3 | Date: Wed, 4 Dec 2019 16:53:46 +0000 4 | Subject: [PATCH] Add xtsproxy Crypto API module 5 | 6 | This module implements a Crypto API AES-XTS synchronous driver, which uses 7 | AES NI implementation as a backend and falls back to generic AES implementation, 8 | when FPU is not usable. 9 | --- 10 | crypto/Kconfig | 10 ++++ 11 | crypto/Makefile | 1 + 12 | crypto/xtsproxy.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++ 13 | 3 files changed, 142 insertions(+) 14 | create mode 100644 crypto/xtsproxy.c 15 | 16 | diff --git a/crypto/Kconfig b/crypto/Kconfig 17 | index 094ef56ab7b4..9964667cef85 100644 18 | --- a/crypto/Kconfig 19 | +++ b/crypto/Kconfig 20 | @@ -457,6 +457,16 @@ config CRYPTO_XTS 21 | key size 256, 384 or 512 bits. This implementation currently 22 | can't handle a sectorsize which is not a multiple of 16 bytes. 23 | 24 | +config CRYPTO_XTS_AES_SYNC 25 | + tristate "XTS AES synchronous implementation" 26 | + depends on X86 && 64BIT 27 | + select CRYPTO_AES 28 | + select CRYPTO_AES_NI_INTEL 29 | + help 30 | + A synchronous AES-XTS implementation, which uses AES NI as a 31 | + backend implementation and falls back to generic implementation, 32 | + when FPU is not usable.
33 | +
34 |  config CRYPTO_KEYWRAP
35 |  	tristate "Key wrapping support"
36 |  	select CRYPTO_SKCIPHER
37 | diff --git a/crypto/Makefile b/crypto/Makefile
38 | index b279483fba50..4f6ddcbdc6d4 100644
39 | --- a/crypto/Makefile
40 | +++ b/crypto/Makefile
41 | @@ -90,6 +90,7 @@ obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o
42 |  obj-$(CONFIG_CRYPTO_CTS) += cts.o
43 |  obj-$(CONFIG_CRYPTO_LRW) += lrw.o
44 |  obj-$(CONFIG_CRYPTO_XTS) += xts.o
45 | +obj-$(CONFIG_CRYPTO_XTS_AES_SYNC) += xtsproxy.o
46 |  obj-$(CONFIG_CRYPTO_CTR) += ctr.o
47 |  obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o
48 |  obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o
49 | diff --git a/crypto/xtsproxy.c b/crypto/xtsproxy.c
50 | new file mode 100644
51 | index 000000000000..51ecfb7b4891
52 | --- /dev/null
53 | +++ b/crypto/xtsproxy.c
54 | @@ -0,0 +1,162 @@
55 | +#include
56 | +#include
57 | +#include
58 | +#include
59 | +#include
60 | +#include
61 | +
62 | +/*
63 | + * Per-tfm context: the two underlying AES-XTS transforms that requests
64 | + * are redirected to.
65 | + */
66 | +struct xtsproxy_ctx {
67 | +	struct crypto_skcipher *xts_aesni;
68 | +	struct crypto_skcipher *xts_generic;
69 | +};
70 | +
71 | +/*
72 | + * Allocate both backend transforms and size the request context so that a
73 | + * request can be handed to either of them.
74 | + *
75 | + * Returns 0 on success, or the PTR_ERR() of whichever allocation failed.
76 | + */
77 | +static int xtsproxy_skcipher_init(struct crypto_skcipher *tfm)
78 | +{
79 | +	struct xtsproxy_ctx *ctx = crypto_skcipher_ctx(tfm);
80 | +	int err;
81 | +
82 | +	/* AESNI based XTS implementation, requires FPU to be available */
83 | +	ctx->xts_aesni = crypto_alloc_skcipher("__xts-aes-aesni", CRYPTO_ALG_INTERNAL, 0);
84 | +	if (IS_ERR(ctx->xts_aesni))
85 | +		return PTR_ERR(ctx->xts_aesni);
86 | +
87 | +	/* generic XTS implementation based on generic FPU-less AES */
88 | +	/* there is also aes-aesni implementation, which falls back to aes-generic */
89 | +	/* but we're doing FPU checks in our code, so no need to repeat those */
90 | +	/* as we will always fallback to aes-generic in this case */
91 | +	ctx->xts_generic = crypto_alloc_skcipher("xts(ecb(aes-generic))", 0, 0);
92 | +	if (IS_ERR(ctx->xts_generic)) {
93 | +		err = PTR_ERR(ctx->xts_generic);
94 | +
95 | +		/*
96 | +		 * The crypto core does not call ->exit() when ->init() fails, so
97 | +		 * release the AESNI transform here or it is leaked. Clearing the
98 | +		 * pointer keeps xtsproxy_skcipher_exit() safe regardless.
99 | +		 */
100 | +		crypto_free_skcipher(ctx->xts_aesni);
101 | +		ctx->xts_aesni = NULL;
102 | +		return err;
103 | +	}
104 | +
105 | +	/* make sure we allocate enough request memory for both implementations */
106 | +	crypto_skcipher_set_reqsize(tfm, max(crypto_skcipher_reqsize(ctx->xts_aesni), crypto_skcipher_reqsize(ctx->xts_generic)));
107 | +
108 | +	return 0;
109 | +}
110 | +
111 | +/* Release whichever backend transforms were successfully allocated. */
112 | +static void xtsproxy_skcipher_exit(struct crypto_skcipher *tfm)
113 | +{
114 | +	struct xtsproxy_ctx *ctx = crypto_skcipher_ctx(tfm);
115 | +
116 | +	if (!IS_ERR_OR_NULL(ctx->xts_generic)) {
117 | +		crypto_free_skcipher(ctx->xts_generic);
118 | +		ctx->xts_generic = NULL;
119 | +	}
120 | +
121 | +	if (!IS_ERR_OR_NULL(ctx->xts_aesni)) {
122 | +		crypto_free_skcipher(ctx->xts_aesni);
123 | +		ctx->xts_aesni = NULL;
124 | +	}
125 | +}
126 | +
127 | +/*
128 | + * Program the same key into both backends so either one can serve a request.
129 | + * Returns the first error reported by a backend, or 0.
130 | + */
131 | +static int xtsproxy_setkey(struct crypto_skcipher *tfm, const u8 *key,
132 | +			   unsigned int keylen)
133 | +{
134 | +	struct xtsproxy_ctx *ctx = crypto_skcipher_ctx(tfm);
135 | +	int err;
136 | +
137 | +	err = crypto_skcipher_setkey(ctx->xts_aesni, key, keylen);
138 | +	if (err)
139 | +		return err;
140 | +
141 | +	return crypto_skcipher_setkey(ctx->xts_generic, key, keylen);
142 | +}
143 | +
144 | +/*
145 | + * Retarget @req at the backend usable in the current context: the AESNI
146 | + * transform when irq_fpu_usable() is true, the generic one otherwise.
147 | + */
148 | +static void xtsproxy_redirect(struct skcipher_request *req)
149 | +{
150 | +	struct xtsproxy_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
151 | +
152 | +	if (irq_fpu_usable())
153 | +		skcipher_request_set_tfm(req, ctx->xts_aesni);
154 | +	else
155 | +		skcipher_request_set_tfm(req, ctx->xts_generic);
156 | +
157 | +	/* underlying implementations should not try to sleep */
158 | +	req->base.flags &= ~(CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG);
159 | +}
160 | +
161 | +/* Encrypt @req with the backend selected by xtsproxy_redirect(). */
162 | +static int xtsproxy_encrypt(struct skcipher_request *req)
163 | +{
164 | +	xtsproxy_redirect(req);
165 | +
166 | +	return crypto_skcipher_encrypt(req);
167 | +}
168 | +
169 | +/* Decrypt @req with the backend selected by xtsproxy_redirect(). */
170 | +static int xtsproxy_decrypt(struct skcipher_request *req)
171 | +{
172 | +	xtsproxy_redirect(req);
173 | +
174 | +	return crypto_skcipher_decrypt(req);
175 | +}
176 | +
177 | +static struct skcipher_alg xtsproxy_skcipher = {
178 | +	.base = {
179 | +		.cra_name = "xts(aes)",
180 | +		.cra_driver_name = "xts-aes-xtsproxy",
181 | +		/* make sure we don't use it unless requested explicitly */
182 | +		.cra_priority = 0,
183 | +		/* .cra_flags = CRYPTO_ALG_INTERNAL, */
184 | +		.cra_blocksize = AES_BLOCK_SIZE,
185 | +		.cra_ctxsize = sizeof(struct xtsproxy_ctx),
186 | +		.cra_module = THIS_MODULE,
187 | +	},
188 | +	.min_keysize = 2 * AES_MIN_KEY_SIZE,
189 | +	.max_keysize = 2 * AES_MAX_KEY_SIZE,
190 | +	.ivsize = AES_BLOCK_SIZE,
191 | +	.init = xtsproxy_skcipher_init,
192 | +	.exit = xtsproxy_skcipher_exit,
193 | +	.setkey = xtsproxy_setkey,
194 | +	.encrypt = xtsproxy_encrypt,
195 | +	.decrypt = xtsproxy_decrypt,
196 | +};
197 | +
198 | +/* Module load: register the algorithm. */
199 | +static int __init xtsproxy_init(void)
200 | +{
201 | +	return crypto_register_skcipher(&xtsproxy_skcipher);
202 | +}
203 | +
204 | +/* Module unload: unregister the algorithm. */
205 | +static void __exit xtsproxy_fini(void)
206 | +{
207 | +	crypto_unregister_skcipher(&xtsproxy_skcipher);
208 | +}
209 | +
210 | +module_init(xtsproxy_init);
211 | +module_exit(xtsproxy_fini);
212 | +
213 | +MODULE_DESCRIPTION("XTS-AES using AESNI implementation with generic AES fallback");
214 | +MODULE_AUTHOR("Ignat Korchagin ");
215 | +MODULE_LICENSE("GPL");
216 | +MODULE_ALIAS_CRYPTO("xts(aes)");
217 | --
218 | 2.29.1
219 |
220 |
--------------------------------------------------------------------------------