├── .gitignore ├── Kbuild ├── Kconfig ├── Makefile ├── README.org ├── cobalt_compat.h ├── gen_cake_const.c ├── pkt_sched.h └── sch_cake.c /.gitignore: -------------------------------------------------------------------------------- 1 | .*.cmd 2 | .tmp_versions 3 | Module.symvers 4 | modules.order 5 | sch_cake.ko 6 | sch_cake.mod.c 7 | *.o 8 | *.dwo 9 | -------------------------------------------------------------------------------- /Kbuild: -------------------------------------------------------------------------------- 1 | ifneq ($(KBUILD_EXTMOD),) 2 | CONFIG_NET_SCH_CAKE := m 3 | endif 4 | 5 | obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o 6 | -------------------------------------------------------------------------------- /Kconfig: -------------------------------------------------------------------------------- 1 | config NET_SCH_CAKE 2 | tristate "Common Applications Kept Enhanced (CAKE)" 3 | depends on NET_SCHED 4 | help 5 | Say Y here if you want to use the Common Applications Kept Enhanced 6 | (CAKE) queue management algorithm. 7 | 8 | To compile this driver as a module, choose M here: the module 9 | will be called sch_cake. 10 | 11 | If unsure, say N. 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | KERNEL_VERSION := $(shell uname -r) 2 | IDIR := /lib/modules/$(KERNEL_VERSION)/kernel/net/sched/ 3 | KDIR := /lib/modules/$(KERNEL_VERSION)/build 4 | PWD := $(shell pwd) 5 | VERSION := $(shell git rev-parse HEAD 2>/dev/null) 6 | default: 7 | @$(MAKE) -C $(KDIR) M=$(PWD) modules $(if $(VERSION),LDFLAGS_MODULE="--build-id=0x$(VERSION)" CFLAGS_MODULE="-DCAKE_VERSION=\\\"$(VERSION)\\\"") 8 | 9 | install: 10 | install -v -m 644 sch_cake.ko $(IDIR) 11 | depmod "$(KERNEL_VERSION)" 12 | [ "$(KERNEL_VERSION)" != `uname -r` ] || modprobe sch_cake 13 | 14 | clean: 15 | @$(MAKE) -C $(KDIR) M=$(PWD) clean 16 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+OPTIONS: ^:nil 2 | 3 | * Common Applications Kept Enhanced (CAKE) scheduler 4 | 5 | This is the out-of-tree version of [[https://www.bufferbloat.net/projects/codel/wiki/Cake/][CAKE]], the Linux qdisc that combines scheduler 6 | and traffic shaper for effective bufferbloat mitigation. 7 | 8 | Note that sch_cake is part of the upstream Linux kernel since kernel version 9 | 4.19, so this repository exists only as a resource for building the qdisc with 10 | older versions of the kernel. If you're already on kernel 4.19 or newer, you can 11 | just load CAKE with =tc= and the kernel shipped by your distribution! 12 | -------------------------------------------------------------------------------- /cobalt_compat.h: -------------------------------------------------------------------------------- 1 | #ifndef __NET_SCHED_COBALT_COMPAT_H 2 | #define __NET_SCHED_COBALT_COMPAT_H 3 | /* Backport some stuff if needed. 
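 *
 * Each block below is guarded by a LINUX_VERSION_CODE check, so only the
 * shims the running kernel actually lacks get compiled in. For example, on
 * a 3.10 kernel the test KERNEL_VERSION(3, 11, 0) > LINUX_VERSION_CODE is
 * true and the ktime_add_ms() fallback just below is defined; on 3.11 and
 * later the kernel is assumed to provide its own helper.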
4 | */ 5 | #if KERNEL_VERSION(3, 11, 0) > LINUX_VERSION_CODE 6 | #define ktime_add_ms(kt, msec) ktime_add_ns(kt, msec * NSEC_PER_MSEC) 7 | #endif 8 | 9 | #if KERNEL_VERSION(3, 14, 0) > LINUX_VERSION_CODE 10 | 11 | static inline u32 reciprocal_scale(u32 val, u32 ep_ro) 12 | { 13 | return (u32)(((u64) val * ep_ro) >> 32); 14 | } 15 | 16 | #endif 17 | 18 | #if KERNEL_VERSION(3, 15, 0) > LINUX_VERSION_CODE 19 | 20 | static inline void kvfree(const void *addr) 21 | { 22 | if (is_vmalloc_addr(addr)) 23 | vfree(addr); 24 | else 25 | kfree(addr); 26 | } 27 | 28 | #endif 29 | 30 | #if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE 31 | #define ktime_after(cmp1, cmp2) ktime_compare(cmp1, cmp2) > 0 32 | #define ktime_before(cmp1, cmp2) ktime_compare(cmp1, cmp2) < 0 33 | #endif 34 | 35 | #if KERNEL_VERSION(3, 17, 0) > LINUX_VERSION_CODE 36 | 37 | #define ktime_get_ns() ktime_to_ns(ktime_get()) 38 | 39 | #endif 40 | 41 | /* 3.18 > 4.7 use 3 arg, everything else uses 2 arg versions 42 | * of qdisc_watchdog_schedule_ns 43 | */ 44 | #if ((KERNEL_VERSION(3, 18, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 8, 0) > LINUX_VERSION_CODE)) 45 | #define qdisc_watchdog_schedule_ns(_a, _b) qdisc_watchdog_schedule_ns(_a, _b, true); 46 | #endif 47 | 48 | #if KERNEL_VERSION(3, 18, 0) > LINUX_VERSION_CODE 49 | static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, 50 | const struct sk_buff *skb) 51 | { 52 | sch->qstats.backlog -= qdisc_pkt_len(skb); 53 | } 54 | 55 | static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, 56 | const struct sk_buff *skb) 57 | { 58 | sch->qstats.backlog += qdisc_pkt_len(skb); 59 | } 60 | 61 | static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) 62 | { 63 | sch->qstats.drops += count; 64 | } 65 | 66 | static inline void qdisc_qstats_drop(struct Qdisc *sch) 67 | { 68 | sch->qstats.drops++; 69 | } 70 | 71 | #define gnet_stats_copy_queue(_a, _b, _c, _d) gnet_stats_copy_queue(_a, _c) 72 | 73 | #endif 74 | 75 | #if KERNEL_VERSION(4, 1, 0) > LINUX_VERSION_CODE 76 | #define TCPOPT_FASTOPEN 34 77 | #endif 78 | 79 | #if KERNEL_VERSION(4, 3, 0) > LINUX_VERSION_CODE 80 | #define tcf_classify(_a, _b, _c, _d) tc_classify(_a, _b, _c); 81 | #elif KERNEL_VERSION(4, 13, 0) > LINUX_VERSION_CODE 82 | #define tcf_classify(_a, _b, _c, _d) tc_classify(_a, _b, _c, _d); 83 | #endif 84 | 85 | #if !defined(IS_REACHABLE) 86 | #define IS_REACHABLE(option) (config_enabled(option) || \ 87 | (config_enabled(option##_MODULE) && config_enabled(MODULE))) 88 | #endif 89 | 90 | #if ((KERNEL_VERSION(4, 4, 114) > LINUX_VERSION_CODE) && \ 91 | ((KERNEL_VERSION(4, 1, 50) > LINUX_VERSION_CODE) || (KERNEL_VERSION(4, 2, 0) <= LINUX_VERSION_CODE))) 92 | static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) 93 | { 94 | return th->doff * 4; 95 | } 96 | #endif 97 | 98 | #if KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE 99 | #define IP6_ECN_set_ce(_a, _b) IP6_ECN_set_ce(_b) 100 | #endif 101 | 102 | #if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE 103 | static inline int skb_try_make_writable(struct sk_buff *skb, 104 | unsigned int write_len) 105 | { 106 | return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && 107 | pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 108 | } 109 | #endif 110 | 111 | #if KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE 112 | static inline int skb_mac_offset(const struct sk_buff *skb) 113 | { 114 | return skb_mac_header(skb) - skb->data; 115 | } 116 | #endif 117 | 118 | #if KERNEL_VERSION(4, 7, 0) > LINUX_VERSION_CODE 119 | #define nla_put_u64_64bit(skb, attrtype, 
value, padattr) nla_put_u64(skb, attrtype, value) 120 | #endif 121 | 122 | #if KERNEL_VERSION(4, 8, 0) > LINUX_VERSION_CODE 123 | #define cake_maybe_lock(sch) 124 | #define cake_maybe_unlock(sch) 125 | #else 126 | #define cake_maybe_lock(sch) sch_tree_lock(sch); 127 | #define cake_maybe_unlock(sch) sch_tree_unlock(sch); 128 | #endif 129 | 130 | 131 | #if KERNEL_VERSION(4, 12, 0) > LINUX_VERSION_CODE 132 | static void *kvzalloc(size_t sz, gfp_t flags) 133 | { 134 | void *ptr = kzalloc(sz, flags); 135 | 136 | if (!ptr) 137 | ptr = vzalloc(sz); 138 | return ptr; 139 | } 140 | #endif 141 | 142 | /* save the best till last 143 | * qdisc_tree_reduce_backlog appears in kernel from: 144 | 3.16.37 onward 145 | not in 3.17 146 | 3.18.37 147 | not in 3.19 148 | not in 4.0 149 | 4.1.28 onward 150 | not in 4.2 151 | not in 4.3 152 | 4.4.11 onward 153 | 4.5.5 onward 154 | */ 155 | #if ((KERNEL_VERSION(3, 0, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(3, 16, 37) > LINUX_VERSION_CODE)) || \ 156 | ((KERNEL_VERSION(3, 18, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(3, 18, 37) > LINUX_VERSION_CODE)) || \ 157 | ((KERNEL_VERSION(4, 1, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 1, 28) > LINUX_VERSION_CODE)) || \ 158 | ((KERNEL_VERSION(4, 4, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 4, 11) > LINUX_VERSION_CODE)) || \ 159 | ((KERNEL_VERSION(4, 5, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 5, 5) > LINUX_VERSION_CODE)) 160 | #define qdisc_tree_reduce_backlog(_a, _b, _c) qdisc_tree_decrease_qlen(_a, _b) 161 | #endif 162 | 163 | 164 | #endif 165 | -------------------------------------------------------------------------------- /gen_cake_const.c: -------------------------------------------------------------------------------- 1 | /** 2 | * cake_const.c 3 | * No point in calculating the diffserv lookup tables at runtime 4 | * Dave Taht 5 | * 2015-12-21 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | /* List of known Diffserv codepoints: 14 | * 15 | * Least Effort (CS1) 16 | * Best Effort (CS0) 17 | * Max Reliability (TOS1) 18 | * Max Throughput (TOS2) 19 | * Min Delay (TOS4) 20 | * Assured Forwarding 1 (AF1x) - x3 21 | * Assured Forwarding 2 (AF2x) - x3 22 | * Assured Forwarding 3 (AF3x) - x3 23 | * Assured Forwarding 4 (AF4x) - x3 24 | * Precedence Class 2 (CS2) 25 | * Precedence Class 3 (CS3) 26 | * Precedence Class 4 (CS4) 27 | * Precedence Class 5 (CS5) 28 | * Precedence Class 6 (CS6) 29 | * Precedence Class 7 (CS7) 30 | * Voice Admit (VA) 31 | * Expedited Forwarding (EF) 32 | 33 | * Total 25 codepoints. 34 | */ 35 | 36 | /* List of traffic classes in RFC 4594: 37 | * (roughly descending order of contended priority) 38 | * (roughly ascending order of uncontended throughput) 39 | * 40 | * Network Control (CS6,CS7) - routing traffic 41 | * Telephony (EF,VA) - aka. VoIP streams 42 | * Signalling (CS5) - VoIP setup 43 | * Multimedia Conferencing (AF4x) - aka. video calls 44 | * Realtime Interactive (CS4) - eg. games 45 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 46 | * Broadcast Video (CS3) 47 | * Low Latency Data (AF2x,TOS4) - eg. database 48 | * Ops, Admin, Management (CS2,TOS1) - eg. ssh 49 | * Standard Service (CS0 & unrecognised codepoints) 50 | * High Throughput Data (AF1x,TOS2) - eg. web traffic 51 | * Low Priority Data (CS1) - eg. BitTorrent 52 | 53 | * Total 12 traffic classes. 54 | */ 55 | 56 | static int min(int a, int b) { 57 | return (a < b ? 
a : b); 58 | } 59 | 60 | static void print_dscp(char *var, uint8_t *dscp) { 61 | printf("static const u8 %s[] = {", var); 62 | for(int i=0;i<64;i+=8) { 63 | for(int j=0; j<7; j++) { 64 | printf("%d, ",(int)dscp[i+j]); 65 | } 66 | printf("%d,\n\t\t\t\t", dscp[i+7]); 67 | } 68 | printf("};\n"); 69 | } 70 | 71 | void precedence() { 72 | uint8_t dscp[64]; 73 | for (int i = 0; i < 64; i++) 74 | dscp[i]= min((i >> 3), 8); 75 | print_dscp("precedence",dscp); 76 | } 77 | 78 | /* Pruned list of traffic classes for typical applications: 79 | * 80 | * Network Control (CS6, CS7) 81 | * Minimum Latency (EF, VA, CS5, CS4) 82 | * Interactive Shell (CS2, TOS1) 83 | * Low Latency Transactions (AF2x, TOS4) 84 | * Video Streaming (AF4x, AF3x, CS3) 85 | * Bog Standard (CS0 etc.) 86 | * High Throughput (AF1x, TOS2) 87 | * Background Traffic (CS1) 88 | * 89 | * Total 8 traffic classes. 90 | */ 91 | 92 | void diffserv8() { 93 | uint8_t dscp[64]; 94 | 95 | /* codepoint to class mapping */ 96 | for (int i = 0; i < 64; i++) 97 | dscp[i] = 2; /* default to best-effort */ 98 | 99 | dscp[0x08] = 0; /* CS1 */ 100 | dscp[0x02] = 1; /* TOS2 */ 101 | dscp[0x18] = 3; /* CS3 */ 102 | dscp[0x04] = 4; /* TOS4 */ 103 | dscp[0x01] = 5; /* TOS1 */ 104 | dscp[0x10] = 5; /* CS2 */ 105 | dscp[0x20] = 6; /* CS4 */ 106 | dscp[0x28] = 6; /* CS5 */ 107 | dscp[0x2c] = 6; /* VA */ 108 | dscp[0x2e] = 6; /* EF */ 109 | dscp[0x30] = 7; /* CS6 */ 110 | dscp[0x38] = 7; /* CS7 */ 111 | 112 | for (int i = 2; i <= 6; i += 2) { 113 | dscp[0x08 + i] = 1; /* AF1x */ 114 | dscp[0x10 + i] = 4; /* AF2x */ 115 | dscp[0x18 + i] = 3; /* AF3x */ 116 | dscp[0x20 + i] = 3; /* AF4x */ 117 | } 118 | 119 | print_dscp("diffserv8",dscp); 120 | } 121 | 122 | /* Diffserv structure specialised for Latency-Loss-Tradeoff spec. 123 | * Loss Sensitive (TOS1, TOS2) 124 | * Best Effort 125 | * Latency Sensitive (TOS4, TOS5, VA, EF) 126 | * Low Priority (CS1) 127 | * Network Control (CS6, CS7) 128 | */ 129 | 130 | void diffserv_llt() { 131 | uint8_t dscp[64]; 132 | /* codepoint to class mapping */ 133 | 134 | for (int i = 0; i < 64; i++) 135 | dscp[i] = 1; /* default to best-effort */ 136 | 137 | dscp[0x01] = 0; /* TOS1 */ 138 | dscp[0x02] = 0; /* TOS2 */ 139 | dscp[0x04] = 2; /* TOS4 */ 140 | dscp[0x05] = 2; /* TOS5 */ 141 | dscp[0x2c] = 2; /* VA */ 142 | dscp[0x2e] = 2; /* EF */ 143 | dscp[0x08] = 3; /* CS1 */ 144 | dscp[0x30] = 4; /* CS6 */ 145 | dscp[0x38] = 4; /* CS7 */ 146 | 147 | print_dscp("diffserv_llt",dscp); 148 | 149 | } 150 | 151 | /* Further pruned list of traffic classes for four-class system: 152 | * 153 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 154 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 155 | * Best Effort (CS0, AF1x, TOS2, and those not specified) 156 | * Background Traffic (CS1) 157 | * 158 | * Total 4 traffic classes. 
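 *
 * Worked example of the table diffserv4() below generates: EF (0x2e) and
 * CS6 (0x30) land in class 3 (Latency Sensitive), AF41 (0x22) lands in
 * class 2 (Streaming Media), CS1 (0x08) lands in class 0 (Background
 * Traffic), and any codepoint not listed stays at the default of class 1
 * (Best Effort).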
159 | */ 160 | 161 | void diffserv4() { 162 | uint8_t dscp[64]; 163 | /* codepoint to class mapping */ 164 | for (int i = 0; i < 64; i++) 165 | dscp[i] = 1; /* default to best-effort */ 166 | 167 | dscp[0x08] = 0; /* CS1 */ 168 | 169 | dscp[0x18] = 2; /* CS3 */ 170 | dscp[0x04] = 2; /* TOS4 */ 171 | dscp[0x01] = 2; /* TOS1 */ 172 | dscp[0x10] = 2; /* CS2 */ 173 | 174 | dscp[0x20] = 3; /* CS4 */ 175 | dscp[0x28] = 3; /* CS5 */ 176 | dscp[0x2c] = 3; /* VA */ 177 | dscp[0x2e] = 3; /* EF */ 178 | dscp[0x30] = 3; /* CS6 */ 179 | dscp[0x38] = 3; /* CS7 */ 180 | 181 | for (int i = 2; i <= 6; i += 2) { 182 | dscp[0x10 + i] = 2; /* AF2x */ 183 | dscp[0x18 + i] = 2; /* AF3x */ 184 | dscp[0x20 + i] = 2; /* AF4x */ 185 | } 186 | 187 | print_dscp("diffserv4",dscp); 188 | } 189 | 190 | /* Simplified Diffserv structure with 3 tins. 191 | * Low Priority (CS1) 192 | * Best Effort 193 | * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 194 | */ 195 | 196 | void diffserv3() { 197 | uint8_t dscp[64]; 198 | /* codepoint to class mapping */ 199 | for (int i = 0; i < 64; i++) 200 | dscp[i] = 1; /* default to best-effort */ 201 | 202 | dscp[0x08] = 0; /* CS1 */ 203 | 204 | dscp[0x04] = 2; /* TOS4 */ 205 | dscp[0x2c] = 2; /* VA */ 206 | dscp[0x2e] = 2; /* EF */ 207 | dscp[0x30] = 2; /* CS6 */ 208 | dscp[0x38] = 2; /* CS7 */ 209 | 210 | print_dscp("diffserv3",dscp); 211 | } 212 | 213 | void besteffort() { 214 | uint8_t dscp[64]; 215 | for (int i = 0; i < 64; i++) 216 | dscp[i]=0; 217 | print_dscp("besteffort",dscp); 218 | } 219 | 220 | int main(int argc, char **argv) { 221 | precedence(); 222 | diffserv_llt(); 223 | diffserv8(); 224 | diffserv4(); 225 | diffserv3(); 226 | besteffort(); 227 | } 228 | -------------------------------------------------------------------------------- /pkt_sched.h: -------------------------------------------------------------------------------- 1 | #ifndef __LINUX_PKT_SCHED_H 2 | #define __LINUX_PKT_SCHED_H 3 | 4 | #include 5 | 6 | /* Logical priority bands not depending on specific packet scheduler. 7 | Every scheduler will map them to real traffic classes, if it has 8 | no more precise mechanism to classify packets. 9 | 10 | These numbers have no special meaning, though their coincidence 11 | with obsolete IPv6 values is not occasional :-). New IPv6 drafts 12 | preferred full anarchy inspired by diffserv group. 13 | 14 | Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy 15 | class, actually, as rule it will be handled with more care than 16 | filler or even bulk. 17 | */ 18 | 19 | #define TC_PRIO_BESTEFFORT 0 20 | #define TC_PRIO_FILLER 1 21 | #define TC_PRIO_BULK 2 22 | #define TC_PRIO_INTERACTIVE_BULK 4 23 | #define TC_PRIO_INTERACTIVE 6 24 | #define TC_PRIO_CONTROL 7 25 | 26 | #define TC_PRIO_MAX 15 27 | 28 | /* Generic queue statistics, available for all the elements. 29 | Particular schedulers may have also their private records. 
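   The otherwise uncommented qlen and backlog fields below are the
   instantaneous queue length in packets and in bytes, respectively.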
30 | */ 31 | 32 | struct tc_stats { 33 | __u64 bytes; /* Number of enqueued bytes */ 34 | __u32 packets; /* Number of enqueued packets */ 35 | __u32 drops; /* Packets dropped because of lack of resources */ 36 | __u32 overlimits; /* Number of throttle events when this 37 | * flow goes out of allocated bandwidth */ 38 | __u32 bps; /* Current flow byte rate */ 39 | __u32 pps; /* Current flow packet rate */ 40 | __u32 qlen; 41 | __u32 backlog; 42 | }; 43 | 44 | struct tc_estimator { 45 | signed char interval; 46 | unsigned char ewma_log; 47 | }; 48 | 49 | /* "Handles" 50 | --------- 51 | 52 | All the traffic control objects have 32bit identifiers, or "handles". 53 | 54 | They can be considered as opaque numbers from user API viewpoint, 55 | but actually they always consist of two fields: major and 56 | minor numbers, which are interpreted by kernel specially, 57 | that may be used by applications, though not recommended. 58 | 59 | F.e. qdisc handles always have minor number equal to zero, 60 | classes (or flows) have major equal to parent qdisc major, and 61 | minor uniquely identifying class inside qdisc. 62 | 63 | Macros to manipulate handles: 64 | */ 65 | 66 | 67 | #define TC_H_MAJ_MASK (0xFFFF0000U) 68 | #define TC_H_MIN_MASK (0x0000FFFFU) 69 | #define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) 70 | #define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) 71 | #define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) 72 | 73 | #define TC_H_UNSPEC (0U) 74 | #define TC_H_ROOT (0xFFFFFFFFU) 75 | #define TC_H_INGRESS (0xFFFFFFF1U) 76 | #ifndef TC_H_CLSACT 77 | #define TC_H_CLSACT TC_H_INGRESS 78 | #define TC_H_MIN_PRIORITY 0xFFF0U 79 | #define TC_H_MIN_INGRESS 0xFFF2U 80 | #define TC_H_MIN_EGRESS 0xFFF3U 81 | #endif 82 | 83 | /* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ 84 | enum tc_link_layer { 85 | TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ 86 | TC_LINKLAYER_ETHERNET, 87 | TC_LINKLAYER_ATM, 88 | }; 89 | #define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ 90 | 91 | struct tc_ratespec { 92 | unsigned char cell_log; 93 | __u8 linklayer; /* lower 4 bits */ 94 | unsigned short overhead; 95 | short cell_align; 96 | unsigned short mpu; 97 | __u32 rate; 98 | }; 99 | 100 | #define TC_RTAB_SIZE 1024 101 | 102 | struct tc_sizespec { 103 | unsigned char cell_log; 104 | unsigned char size_log; 105 | short cell_align; 106 | int overhead; 107 | unsigned int linklayer; 108 | unsigned int mpu; 109 | unsigned int mtu; 110 | unsigned int tsize; 111 | }; 112 | 113 | enum { 114 | TCA_STAB_UNSPEC, 115 | TCA_STAB_BASE, 116 | TCA_STAB_DATA, 117 | __TCA_STAB_MAX 118 | }; 119 | 120 | #define TCA_STAB_MAX (__TCA_STAB_MAX - 1) 121 | 122 | /* FIFO section */ 123 | 124 | struct tc_fifo_qopt { 125 | __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ 126 | }; 127 | 128 | /* PRIO section */ 129 | 130 | #define TCQ_PRIO_BANDS 16 131 | #define TCQ_MIN_PRIO_BANDS 2 132 | 133 | struct tc_prio_qopt { 134 | int bands; /* Number of bands */ 135 | __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ 136 | }; 137 | 138 | /* MULTIQ section */ 139 | 140 | struct tc_multiq_qopt { 141 | __u16 bands; /* Number of bands */ 142 | __u16 max_bands; /* Maximum number of queues */ 143 | }; 144 | 145 | /* PLUG section */ 146 | 147 | #define TCQ_PLUG_BUFFER 0 148 | #define TCQ_PLUG_RELEASE_ONE 1 149 | #define TCQ_PLUG_RELEASE_INDEFINITE 2 150 | #define TCQ_PLUG_LIMIT 3 151 | 152 | struct tc_plug_qopt { 153 | /* TCQ_PLUG_BUFFER: Inset a plug into the queue and 154 | * 
buffer any incoming packets 155 | * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head 156 | * to beginning of the next plug. 157 | * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. 158 | * Stop buffering packets until the next TCQ_PLUG_BUFFER 159 | * command is received (just act as a pass-thru queue). 160 | * TCQ_PLUG_LIMIT: Increase/decrease queue size 161 | */ 162 | int action; 163 | __u32 limit; 164 | }; 165 | 166 | /* TBF section */ 167 | 168 | struct tc_tbf_qopt { 169 | struct tc_ratespec rate; 170 | struct tc_ratespec peakrate; 171 | __u32 limit; 172 | __u32 buffer; 173 | __u32 mtu; 174 | }; 175 | 176 | enum { 177 | TCA_TBF_UNSPEC, 178 | TCA_TBF_PARMS, 179 | TCA_TBF_RTAB, 180 | TCA_TBF_PTAB, 181 | TCA_TBF_RATE64, 182 | TCA_TBF_PRATE64, 183 | TCA_TBF_BURST, 184 | TCA_TBF_PBURST, 185 | __TCA_TBF_MAX, 186 | }; 187 | 188 | #define TCA_TBF_MAX (__TCA_TBF_MAX - 1) 189 | 190 | 191 | /* TEQL section */ 192 | 193 | /* TEQL does not require any parameters */ 194 | 195 | /* SFQ section */ 196 | 197 | struct tc_sfq_qopt { 198 | unsigned quantum; /* Bytes per round allocated to flow */ 199 | int perturb_period; /* Period of hash perturbation */ 200 | __u32 limit; /* Maximal packets in queue */ 201 | unsigned divisor; /* Hash divisor */ 202 | unsigned flows; /* Maximal number of flows */ 203 | }; 204 | 205 | struct tc_sfqred_stats { 206 | __u32 prob_drop; /* Early drops, below max threshold */ 207 | __u32 forced_drop; /* Early drops, after max threshold */ 208 | __u32 prob_mark; /* Marked packets, below max threshold */ 209 | __u32 forced_mark; /* Marked packets, after max threshold */ 210 | __u32 prob_mark_head; /* Marked packets, below max threshold */ 211 | __u32 forced_mark_head;/* Marked packets, after max threshold */ 212 | }; 213 | 214 | struct tc_sfq_qopt_v1 { 215 | struct tc_sfq_qopt v0; 216 | unsigned int depth; /* max number of packets per flow */ 217 | unsigned int headdrop; 218 | /* SFQRED parameters */ 219 | __u32 limit; /* HARD maximal flow queue length (bytes) */ 220 | __u32 qth_min; /* Min average length threshold (bytes) */ 221 | __u32 qth_max; /* Max average length threshold (bytes) */ 222 | unsigned char Wlog; /* log(W) */ 223 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 224 | unsigned char Scell_log; /* cell size for idle damping */ 225 | unsigned char flags; 226 | __u32 max_P; /* probability, high resolution */ 227 | /* SFQRED stats */ 228 | struct tc_sfqred_stats stats; 229 | }; 230 | 231 | 232 | struct tc_sfq_xstats { 233 | __s32 allot; 234 | }; 235 | 236 | /* RED section */ 237 | 238 | enum { 239 | TCA_RED_UNSPEC, 240 | TCA_RED_PARMS, 241 | TCA_RED_STAB, 242 | TCA_RED_MAX_P, 243 | __TCA_RED_MAX, 244 | }; 245 | 246 | #define TCA_RED_MAX (__TCA_RED_MAX - 1) 247 | 248 | struct tc_red_qopt { 249 | __u32 limit; /* HARD maximal queue length (bytes) */ 250 | __u32 qth_min; /* Min average length threshold (bytes) */ 251 | __u32 qth_max; /* Max average length threshold (bytes) */ 252 | unsigned char Wlog; /* log(W) */ 253 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 254 | unsigned char Scell_log; /* cell size for idle damping */ 255 | unsigned char flags; 256 | #define TC_RED_ECN 1 257 | #define TC_RED_HARDDROP 2 258 | #define TC_RED_ADAPTATIVE 4 259 | }; 260 | 261 | struct tc_red_xstats { 262 | __u32 early; /* Early drops */ 263 | __u32 pdrop; /* Drops due to queue limits */ 264 | __u32 other; /* Drops due to drop() calls */ 265 | __u32 marked; /* Marked packets */ 266 | }; 267 | 268 | /* GRED section */ 269 | 270 | #define MAX_DPs 16 271 
| 272 | enum { 273 | TCA_GRED_UNSPEC, 274 | TCA_GRED_PARMS, 275 | TCA_GRED_STAB, 276 | TCA_GRED_DPS, 277 | TCA_GRED_MAX_P, 278 | TCA_GRED_LIMIT, 279 | __TCA_GRED_MAX, 280 | }; 281 | 282 | #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) 283 | 284 | struct tc_gred_qopt { 285 | __u32 limit; /* HARD maximal queue length (bytes) */ 286 | __u32 qth_min; /* Min average length threshold (bytes) */ 287 | __u32 qth_max; /* Max average length threshold (bytes) */ 288 | __u32 DP; /* up to 2^32 DPs */ 289 | __u32 backlog; 290 | __u32 qave; 291 | __u32 forced; 292 | __u32 early; 293 | __u32 other; 294 | __u32 pdrop; 295 | __u8 Wlog; /* log(W) */ 296 | __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ 297 | __u8 Scell_log; /* cell size for idle damping */ 298 | __u8 prio; /* prio of this VQ */ 299 | __u32 packets; 300 | __u32 bytesin; 301 | }; 302 | 303 | /* gred setup */ 304 | struct tc_gred_sopt { 305 | __u32 DPs; 306 | __u32 def_DP; 307 | __u8 grio; 308 | __u8 flags; 309 | __u16 pad1; 310 | }; 311 | 312 | /* CHOKe section */ 313 | 314 | enum { 315 | TCA_CHOKE_UNSPEC, 316 | TCA_CHOKE_PARMS, 317 | TCA_CHOKE_STAB, 318 | TCA_CHOKE_MAX_P, 319 | __TCA_CHOKE_MAX, 320 | }; 321 | 322 | #define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) 323 | 324 | struct tc_choke_qopt { 325 | __u32 limit; /* Hard queue length (packets) */ 326 | __u32 qth_min; /* Min average threshold (packets) */ 327 | __u32 qth_max; /* Max average threshold (packets) */ 328 | unsigned char Wlog; /* log(W) */ 329 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 330 | unsigned char Scell_log; /* cell size for idle damping */ 331 | unsigned char flags; /* see RED flags */ 332 | }; 333 | 334 | struct tc_choke_xstats { 335 | __u32 early; /* Early drops */ 336 | __u32 pdrop; /* Drops due to queue limits */ 337 | __u32 other; /* Drops due to drop() calls */ 338 | __u32 marked; /* Marked packets */ 339 | __u32 matched; /* Drops due to flow match */ 340 | }; 341 | 342 | /* HTB section */ 343 | #define TC_HTB_NUMPRIO 8 344 | #define TC_HTB_MAXDEPTH 8 345 | #define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ 346 | 347 | struct tc_htb_opt { 348 | struct tc_ratespec rate; 349 | struct tc_ratespec ceil; 350 | __u32 buffer; 351 | __u32 cbuffer; 352 | __u32 quantum; 353 | __u32 level; /* out only */ 354 | __u32 prio; 355 | }; 356 | struct tc_htb_glob { 357 | __u32 version; /* to match HTB/TC */ 358 | __u32 rate2quantum; /* bps->quantum divisor */ 359 | __u32 defcls; /* default class number */ 360 | __u32 debug; /* debug flags */ 361 | 362 | /* stats */ 363 | __u32 direct_pkts; /* count of non shaped packets */ 364 | }; 365 | enum { 366 | TCA_HTB_UNSPEC, 367 | TCA_HTB_PARMS, 368 | TCA_HTB_INIT, 369 | TCA_HTB_CTAB, 370 | TCA_HTB_RTAB, 371 | TCA_HTB_DIRECT_QLEN, 372 | TCA_HTB_RATE64, 373 | TCA_HTB_CEIL64, 374 | __TCA_HTB_MAX, 375 | }; 376 | 377 | #define TCA_HTB_MAX (__TCA_HTB_MAX - 1) 378 | 379 | struct tc_htb_xstats { 380 | __u32 lends; 381 | __u32 borrows; 382 | __u32 giants; /* too big packets (rate will not be accurate) */ 383 | __u32 tokens; 384 | __u32 ctokens; 385 | }; 386 | 387 | /* HFSC section */ 388 | 389 | struct tc_hfsc_qopt { 390 | __u16 defcls; /* default class */ 391 | }; 392 | 393 | struct tc_service_curve { 394 | __u32 m1; /* slope of the first segment in bps */ 395 | __u32 d; /* x-projection of the first segment in us */ 396 | __u32 m2; /* slope of the second segment in bps */ 397 | }; 398 | 399 | struct tc_hfsc_stats { 400 | __u64 work; /* total work done */ 401 | __u64 rtwork; /* work done by real-time criteria */ 402 | __u32 period; /* 
current period */ 403 | __u32 level; /* class level in hierarchy */ 404 | }; 405 | 406 | enum { 407 | TCA_HFSC_UNSPEC, 408 | TCA_HFSC_RSC, 409 | TCA_HFSC_FSC, 410 | TCA_HFSC_USC, 411 | __TCA_HFSC_MAX, 412 | }; 413 | 414 | #define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) 415 | 416 | 417 | /* CBQ section */ 418 | 419 | #define TC_CBQ_MAXPRIO 8 420 | #define TC_CBQ_MAXLEVEL 8 421 | #define TC_CBQ_DEF_EWMA 5 422 | 423 | struct tc_cbq_lssopt { 424 | unsigned char change; 425 | unsigned char flags; 426 | #define TCF_CBQ_LSS_BOUNDED 1 427 | #define TCF_CBQ_LSS_ISOLATED 2 428 | unsigned char ewma_log; 429 | unsigned char level; 430 | #define TCF_CBQ_LSS_FLAGS 1 431 | #define TCF_CBQ_LSS_EWMA 2 432 | #define TCF_CBQ_LSS_MAXIDLE 4 433 | #define TCF_CBQ_LSS_MINIDLE 8 434 | #define TCF_CBQ_LSS_OFFTIME 0x10 435 | #define TCF_CBQ_LSS_AVPKT 0x20 436 | __u32 maxidle; 437 | __u32 minidle; 438 | __u32 offtime; 439 | __u32 avpkt; 440 | }; 441 | 442 | struct tc_cbq_wrropt { 443 | unsigned char flags; 444 | unsigned char priority; 445 | unsigned char cpriority; 446 | unsigned char __reserved; 447 | __u32 allot; 448 | __u32 weight; 449 | }; 450 | 451 | struct tc_cbq_ovl { 452 | unsigned char strategy; 453 | #define TC_CBQ_OVL_CLASSIC 0 454 | #define TC_CBQ_OVL_DELAY 1 455 | #define TC_CBQ_OVL_LOWPRIO 2 456 | #define TC_CBQ_OVL_DROP 3 457 | #define TC_CBQ_OVL_RCLASSIC 4 458 | unsigned char priority2; 459 | __u16 pad; 460 | __u32 penalty; 461 | }; 462 | 463 | struct tc_cbq_police { 464 | unsigned char police; 465 | unsigned char __res1; 466 | unsigned short __res2; 467 | }; 468 | 469 | struct tc_cbq_fopt { 470 | __u32 split; 471 | __u32 defmap; 472 | __u32 defchange; 473 | }; 474 | 475 | struct tc_cbq_xstats { 476 | __u32 borrows; 477 | __u32 overactions; 478 | __s32 avgidle; 479 | __s32 undertime; 480 | }; 481 | 482 | enum { 483 | TCA_CBQ_UNSPEC, 484 | TCA_CBQ_LSSOPT, 485 | TCA_CBQ_WRROPT, 486 | TCA_CBQ_FOPT, 487 | TCA_CBQ_OVL_STRATEGY, 488 | TCA_CBQ_RATE, 489 | TCA_CBQ_RTAB, 490 | TCA_CBQ_POLICE, 491 | __TCA_CBQ_MAX, 492 | }; 493 | 494 | #define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) 495 | 496 | /* dsmark section */ 497 | 498 | enum { 499 | TCA_DSMARK_UNSPEC, 500 | TCA_DSMARK_INDICES, 501 | TCA_DSMARK_DEFAULT_INDEX, 502 | TCA_DSMARK_SET_TC_INDEX, 503 | TCA_DSMARK_MASK, 504 | TCA_DSMARK_VALUE, 505 | __TCA_DSMARK_MAX, 506 | }; 507 | 508 | #define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) 509 | 510 | /* ATM section */ 511 | 512 | enum { 513 | TCA_ATM_UNSPEC, 514 | TCA_ATM_FD, /* file/socket descriptor */ 515 | TCA_ATM_PTR, /* pointer to descriptor - later */ 516 | TCA_ATM_HDR, /* LL header */ 517 | TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ 518 | TCA_ATM_ADDR, /* PVC address (for output only) */ 519 | TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ 520 | __TCA_ATM_MAX, 521 | }; 522 | 523 | #define TCA_ATM_MAX (__TCA_ATM_MAX - 1) 524 | 525 | /* Network emulator */ 526 | 527 | enum { 528 | TCA_NETEM_UNSPEC, 529 | TCA_NETEM_CORR, 530 | TCA_NETEM_DELAY_DIST, 531 | TCA_NETEM_REORDER, 532 | TCA_NETEM_CORRUPT, 533 | TCA_NETEM_LOSS, 534 | TCA_NETEM_RATE, 535 | TCA_NETEM_ECN, 536 | TCA_NETEM_RATE64, 537 | __TCA_NETEM_MAX, 538 | }; 539 | 540 | #define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) 541 | 542 | struct tc_netem_qopt { 543 | __u32 latency; /* added delay (us) */ 544 | __u32 limit; /* fifo limit (packets) */ 545 | __u32 loss; /* random packet loss (0=none ~0=100%) */ 546 | __u32 gap; /* re-ordering gap (0 for none) */ 547 | __u32 duplicate; /* random packet dup (0=none ~0=100%) */ 548 | __u32 jitter; /* random jitter 
in latency (us) */ 549 | }; 550 | 551 | struct tc_netem_corr { 552 | __u32 delay_corr; /* delay correlation */ 553 | __u32 loss_corr; /* packet loss correlation */ 554 | __u32 dup_corr; /* duplicate correlation */ 555 | }; 556 | 557 | struct tc_netem_reorder { 558 | __u32 probability; 559 | __u32 correlation; 560 | }; 561 | 562 | struct tc_netem_corrupt { 563 | __u32 probability; 564 | __u32 correlation; 565 | }; 566 | 567 | struct tc_netem_rate { 568 | __u32 rate; /* byte/s */ 569 | __s32 packet_overhead; 570 | __u32 cell_size; 571 | __s32 cell_overhead; 572 | }; 573 | 574 | enum { 575 | NETEM_LOSS_UNSPEC, 576 | NETEM_LOSS_GI, /* General Intuitive - 4 state model */ 577 | NETEM_LOSS_GE, /* Gilbert Elliot models */ 578 | __NETEM_LOSS_MAX 579 | }; 580 | #define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) 581 | 582 | /* State transition probabilities for 4 state model */ 583 | struct tc_netem_gimodel { 584 | __u32 p13; 585 | __u32 p31; 586 | __u32 p32; 587 | __u32 p14; 588 | __u32 p23; 589 | }; 590 | 591 | /* Gilbert-Elliot models */ 592 | struct tc_netem_gemodel { 593 | __u32 p; 594 | __u32 r; 595 | __u32 h; 596 | __u32 k1; 597 | }; 598 | 599 | #define NETEM_DIST_SCALE 8192 600 | #define NETEM_DIST_MAX 16384 601 | 602 | /* DRR */ 603 | 604 | enum { 605 | TCA_DRR_UNSPEC, 606 | TCA_DRR_QUANTUM, 607 | __TCA_DRR_MAX 608 | }; 609 | 610 | #define TCA_DRR_MAX (__TCA_DRR_MAX - 1) 611 | 612 | struct tc_drr_stats { 613 | __u32 deficit; 614 | }; 615 | 616 | /* MQPRIO */ 617 | #define TC_QOPT_BITMASK 15 618 | #define TC_QOPT_MAX_QUEUE 16 619 | 620 | struct tc_mqprio_qopt { 621 | __u8 num_tc; 622 | __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; 623 | __u8 hw; 624 | __u16 count[TC_QOPT_MAX_QUEUE]; 625 | __u16 offset[TC_QOPT_MAX_QUEUE]; 626 | }; 627 | 628 | /* SFB */ 629 | 630 | enum { 631 | TCA_SFB_UNSPEC, 632 | TCA_SFB_PARMS, 633 | __TCA_SFB_MAX, 634 | }; 635 | 636 | #define TCA_SFB_MAX (__TCA_SFB_MAX - 1) 637 | 638 | /* 639 | * Note: increment, decrement are Q0.16 fixed-point values. 
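 * For example, 0x8000 corresponds to a probability of 0.5 and 0xFFFF to
 * (almost) 1.0, i.e. the value divided by 65536.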
640 | */ 641 | struct tc_sfb_qopt { 642 | __u32 rehash_interval; /* delay between hash move, in ms */ 643 | __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ 644 | __u32 max; /* max len of qlen_min */ 645 | __u32 bin_size; /* maximum queue length per bin */ 646 | __u32 increment; /* probability increment, (d1 in Blue) */ 647 | __u32 decrement; /* probability decrement, (d2 in Blue) */ 648 | __u32 limit; /* max SFB queue length */ 649 | __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ 650 | __u32 penalty_burst; 651 | }; 652 | 653 | struct tc_sfb_xstats { 654 | __u32 earlydrop; 655 | __u32 penaltydrop; 656 | __u32 bucketdrop; 657 | __u32 queuedrop; 658 | __u32 childdrop; /* drops in child qdisc */ 659 | __u32 marked; 660 | __u32 maxqlen; 661 | __u32 maxprob; 662 | __u32 avgprob; 663 | }; 664 | 665 | #define SFB_MAX_PROB 0xFFFF 666 | 667 | /* QFQ */ 668 | enum { 669 | TCA_QFQ_UNSPEC, 670 | TCA_QFQ_WEIGHT, 671 | TCA_QFQ_LMAX, 672 | __TCA_QFQ_MAX 673 | }; 674 | 675 | #define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) 676 | 677 | struct tc_qfq_stats { 678 | __u32 weight; 679 | __u32 lmax; 680 | }; 681 | 682 | /* CODEL */ 683 | 684 | enum { 685 | TCA_CODEL_UNSPEC, 686 | TCA_CODEL_TARGET, 687 | TCA_CODEL_LIMIT, 688 | TCA_CODEL_INTERVAL, 689 | TCA_CODEL_ECN, 690 | TCA_CODEL_CE_THRESHOLD, 691 | __TCA_CODEL_MAX 692 | }; 693 | 694 | #define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) 695 | 696 | struct tc_codel_xstats { 697 | __u32 maxpacket; /* largest packet we've seen so far */ 698 | __u32 count; /* how many drops we've done since the last time we 699 | * entered dropping state 700 | */ 701 | __u32 lastcount; /* count at entry to dropping state */ 702 | __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ 703 | __s32 drop_next; /* time to drop next packet */ 704 | __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ 705 | __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ 706 | __u32 dropping; /* are we in dropping state ? 
*/ 707 | __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ 708 | }; 709 | 710 | /* FQ_CODEL */ 711 | 712 | enum { 713 | TCA_FQ_CODEL_UNSPEC, 714 | TCA_FQ_CODEL_TARGET, 715 | TCA_FQ_CODEL_LIMIT, 716 | TCA_FQ_CODEL_INTERVAL, 717 | TCA_FQ_CODEL_ECN, 718 | TCA_FQ_CODEL_FLOWS, 719 | TCA_FQ_CODEL_QUANTUM, 720 | TCA_FQ_CODEL_CE_THRESHOLD, 721 | __TCA_FQ_CODEL_MAX 722 | }; 723 | 724 | #define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) 725 | 726 | enum { 727 | TCA_FQ_CODEL_XSTATS_QDISC, 728 | TCA_FQ_CODEL_XSTATS_CLASS, 729 | }; 730 | 731 | struct tc_fq_codel_qd_stats { 732 | __u32 maxpacket; /* largest packet we've seen so far */ 733 | __u32 drop_overlimit; /* number of time max qdisc 734 | * packet limit was hit 735 | */ 736 | __u32 ecn_mark; /* number of packets we ECN marked 737 | * instead of being dropped 738 | */ 739 | __u32 new_flow_count; /* number of time packets 740 | * created a 'new flow' 741 | */ 742 | __u32 new_flows_len; /* count of flows in new list */ 743 | __u32 old_flows_len; /* count of flows in old list */ 744 | __u32 ce_mark; /* packets above ce_threshold */ 745 | }; 746 | 747 | struct tc_fq_codel_cl_stats { 748 | __s32 deficit; 749 | __u32 ldelay; /* in-queue delay seen by most recently 750 | * dequeued packet 751 | */ 752 | __u32 count; 753 | __u32 lastcount; 754 | __u32 dropping; 755 | __s32 drop_next; 756 | }; 757 | 758 | struct tc_fq_codel_xstats { 759 | __u32 type; 760 | union { 761 | struct tc_fq_codel_qd_stats qdisc_stats; 762 | struct tc_fq_codel_cl_stats class_stats; 763 | }; 764 | }; 765 | 766 | /* FQ */ 767 | 768 | enum { 769 | TCA_FQ_UNSPEC, 770 | 771 | TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ 772 | 773 | TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ 774 | 775 | TCA_FQ_QUANTUM, /* RR quantum */ 776 | 777 | TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ 778 | 779 | TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ 780 | 781 | TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ 782 | 783 | TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ 784 | 785 | TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ 786 | 787 | TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ 788 | 789 | TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ 790 | 791 | __TCA_FQ_MAX 792 | }; 793 | 794 | #define TCA_FQ_MAX (__TCA_FQ_MAX - 1) 795 | 796 | struct tc_fq_qd_stats { 797 | __u64 gc_flows; 798 | __u64 highprio_packets; 799 | __u64 tcp_retrans; 800 | __u64 throttled; 801 | __u64 flows_plimit; 802 | __u64 pkts_too_long; 803 | __u64 allocation_errors; 804 | __s64 time_next_delayed_flow; 805 | __u32 flows; 806 | __u32 inactive_flows; 807 | __u32 throttled_flows; 808 | __u32 pad; 809 | }; 810 | 811 | /* Heavy-Hitter Filter */ 812 | 813 | enum { 814 | TCA_HHF_UNSPEC, 815 | TCA_HHF_BACKLOG_LIMIT, 816 | TCA_HHF_QUANTUM, 817 | TCA_HHF_HH_FLOWS_LIMIT, 818 | TCA_HHF_RESET_TIMEOUT, 819 | TCA_HHF_ADMIT_BYTES, 820 | TCA_HHF_EVICT_TIMEOUT, 821 | TCA_HHF_NON_HH_WEIGHT, 822 | __TCA_HHF_MAX 823 | }; 824 | 825 | #define TCA_HHF_MAX (__TCA_HHF_MAX - 1) 826 | 827 | struct tc_hhf_xstats { 828 | __u32 drop_overlimit; /* number of times max qdisc packet limit 829 | * was hit 830 | */ 831 | __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ 832 | __u32 hh_tot_count; /* number of captured heavy-hitters so far */ 833 | __u32 hh_cur_count; /* number of current heavy-hitters */ 834 | }; 835 | 836 | /* PIE */ 837 | enum { 838 | TCA_PIE_UNSPEC, 839 | TCA_PIE_TARGET, 840 | TCA_PIE_LIMIT, 841 | TCA_PIE_TUPDATE, 842 | 
TCA_PIE_ALPHA, 843 | TCA_PIE_BETA, 844 | TCA_PIE_ECN, 845 | TCA_PIE_BYTEMODE, 846 | __TCA_PIE_MAX 847 | }; 848 | #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) 849 | 850 | struct tc_pie_xstats { 851 | __u32 prob; /* current probability */ 852 | __u32 delay; /* current delay in ms */ 853 | __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ 854 | __u32 packets_in; /* total number of packets enqueued */ 855 | __u32 dropped; /* packets dropped due to pie_action */ 856 | __u32 overlimit; /* dropped due to lack of space in queue */ 857 | __u32 maxq; /* maximum queue size */ 858 | __u32 ecn_mark; /* packets marked with ecn*/ 859 | }; 860 | 861 | /* CAKE */ 862 | enum { 863 | TCA_CAKE_UNSPEC, 864 | TCA_CAKE_PAD, 865 | TCA_CAKE_BASE_RATE64, 866 | TCA_CAKE_DIFFSERV_MODE, 867 | TCA_CAKE_ATM, 868 | TCA_CAKE_FLOW_MODE, 869 | TCA_CAKE_OVERHEAD, 870 | TCA_CAKE_RTT, 871 | TCA_CAKE_TARGET, 872 | TCA_CAKE_AUTORATE, 873 | TCA_CAKE_MEMORY, 874 | TCA_CAKE_NAT, 875 | TCA_CAKE_RAW, // was _ETHERNET 876 | TCA_CAKE_WASH, 877 | TCA_CAKE_MPU, 878 | TCA_CAKE_INGRESS, 879 | TCA_CAKE_ACK_FILTER, 880 | TCA_CAKE_SPLIT_GSO, 881 | TCA_CAKE_FWMARK, 882 | __TCA_CAKE_MAX 883 | }; 884 | #define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) 885 | 886 | enum { 887 | __TCA_CAKE_STATS_INVALID, 888 | TCA_CAKE_STATS_PAD, 889 | TCA_CAKE_STATS_CAPACITY_ESTIMATE64, 890 | TCA_CAKE_STATS_MEMORY_LIMIT, 891 | TCA_CAKE_STATS_MEMORY_USED, 892 | TCA_CAKE_STATS_AVG_NETOFF, 893 | TCA_CAKE_STATS_MIN_NETLEN, 894 | TCA_CAKE_STATS_MAX_NETLEN, 895 | TCA_CAKE_STATS_MIN_ADJLEN, 896 | TCA_CAKE_STATS_MAX_ADJLEN, 897 | TCA_CAKE_STATS_TIN_STATS, 898 | TCA_CAKE_STATS_DEFICIT, 899 | TCA_CAKE_STATS_COBALT_COUNT, 900 | TCA_CAKE_STATS_DROPPING, 901 | TCA_CAKE_STATS_DROP_NEXT_US, 902 | TCA_CAKE_STATS_P_DROP, 903 | TCA_CAKE_STATS_BLUE_TIMER_US, 904 | __TCA_CAKE_STATS_MAX 905 | }; 906 | #define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) 907 | 908 | enum { 909 | __TCA_CAKE_TIN_STATS_INVALID, 910 | TCA_CAKE_TIN_STATS_PAD, 911 | TCA_CAKE_TIN_STATS_SENT_PACKETS, 912 | TCA_CAKE_TIN_STATS_SENT_BYTES64, 913 | TCA_CAKE_TIN_STATS_DROPPED_PACKETS, 914 | TCA_CAKE_TIN_STATS_DROPPED_BYTES64, 915 | TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, 916 | TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, 917 | TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, 918 | TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, 919 | TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, 920 | TCA_CAKE_TIN_STATS_BACKLOG_BYTES, 921 | TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, 922 | TCA_CAKE_TIN_STATS_TARGET_US, 923 | TCA_CAKE_TIN_STATS_INTERVAL_US, 924 | TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, 925 | TCA_CAKE_TIN_STATS_WAY_MISSES, 926 | TCA_CAKE_TIN_STATS_WAY_COLLISIONS, 927 | TCA_CAKE_TIN_STATS_PEAK_DELAY_US, 928 | TCA_CAKE_TIN_STATS_AVG_DELAY_US, 929 | TCA_CAKE_TIN_STATS_BASE_DELAY_US, 930 | TCA_CAKE_TIN_STATS_SPARSE_FLOWS, 931 | TCA_CAKE_TIN_STATS_BULK_FLOWS, 932 | TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, 933 | TCA_CAKE_TIN_STATS_MAX_SKBLEN, 934 | TCA_CAKE_TIN_STATS_FLOW_QUANTUM, 935 | __TCA_CAKE_TIN_STATS_MAX 936 | }; 937 | #define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) 938 | #define TC_CAKE_MAX_TINS (8) 939 | 940 | enum { 941 | CAKE_FLOW_NONE = 0, 942 | CAKE_FLOW_SRC_IP, 943 | CAKE_FLOW_DST_IP, 944 | CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ 945 | CAKE_FLOW_FLOWS, 946 | CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ 947 | CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ 948 | CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ 949 | CAKE_FLOW_MAX, 950 | }; 951 | 952 | enum { 953 | 
CAKE_DIFFSERV_DIFFSERV3 = 0, 954 | CAKE_DIFFSERV_DIFFSERV4, 955 | CAKE_DIFFSERV_DIFFSERV8, 956 | CAKE_DIFFSERV_BESTEFFORT, 957 | CAKE_DIFFSERV_PRECEDENCE, 958 | CAKE_DIFFSERV_MAX 959 | }; 960 | 961 | enum { 962 | CAKE_ACK_NONE = 0, 963 | CAKE_ACK_FILTER, 964 | CAKE_ACK_AGGRESSIVE, 965 | CAKE_ACK_MAX 966 | }; 967 | 968 | enum { 969 | CAKE_ATM_NONE = 0, 970 | CAKE_ATM_ATM, 971 | CAKE_ATM_PTM, 972 | CAKE_ATM_MAX 973 | }; 974 | 975 | #endif 976 | -------------------------------------------------------------------------------- /sch_cake.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 | 3 | /* COMMON Applications Kept Enhanced (CAKE) discipline 4 | * 5 | * Copyright (C) 2014-2018 Jonathan Morton 6 | * Copyright (C) 2015-2018 Toke Høiland-Jørgensen 7 | * Copyright (C) 2014-2018 Dave Täht 8 | * Copyright (C) 2015-2018 Sebastian Moeller 9 | * (C) 2015-2018 Kevin Darbyshire-Bryant 10 | * Copyright (C) 2017-2018 Ryan Mounce 11 | * 12 | * The CAKE Principles: 13 | * (or, how to have your cake and eat it too) 14 | * 15 | * This is a combination of several shaping, AQM and FQ techniques into one 16 | * easy-to-use package: 17 | * 18 | * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE 19 | * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), 20 | * eliminating the need for any sort of burst parameter (eg. token bucket 21 | * depth). Burst support is limited to that necessary to overcome scheduling 22 | * latency. 23 | * 24 | * - A Diffserv-aware priority queue, giving more priority to certain classes, 25 | * up to a specified fraction of bandwidth. Above that bandwidth threshold, 26 | * the priority is reduced to avoid starving other tins. 27 | * 28 | * - Each priority tin has a separate Flow Queue system, to isolate traffic 29 | * flows from each other. This prevents a burst on one flow from increasing 30 | * the delay to another. Flows are distributed to queues using a 31 | * set-associative hash function. 32 | * 33 | * - Each queue is actively managed by Cobalt, which is a combination of the 34 | * Codel and Blue AQM algorithms. This serves flows fairly, and signals 35 | * congestion early via ECN (if available) and/or packet drops, to keep 36 | * latency low. The codel parameters are auto-tuned based on the bandwidth 37 | * setting, as is necessary at low bandwidths. 38 | * 39 | * The configuration parameters are kept deliberately simple for ease of use. 40 | * Everything has sane defaults. Complete generality of configuration is *not* 41 | * a goal. 42 | * 43 | * The priority queue operates according to a weighted DRR scheme, combined with 44 | * a bandwidth tracker which reuses the shaper logic to detect which side of the 45 | * bandwidth sharing threshold the tin is operating. This determines whether a 46 | * priority-based weight (high) or a bandwidth-based weight (low) is used for 47 | * that tin in the current pass. 48 | * 49 | * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly 50 | * granted us permission to leverage. 
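 *
 * Typical invocation (illustrative only; the parameter values are
 * arbitrary):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which enables the shaper at 20 Mbit/s, the four-tin Diffserv mode
 * described above, and NAT-aware flow hashing.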
51 | */ 52 | 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include 67 | #include 68 | #include "pkt_sched.h" 69 | #include 70 | #include 71 | #include 72 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) 73 | #include 74 | #else 75 | #include 76 | #endif 77 | #include "cobalt_compat.h" 78 | 79 | #if IS_REACHABLE(CONFIG_NF_CONNTRACK) 80 | #include 81 | #include 82 | #include 83 | #endif 84 | 85 | #define CAKE_SET_WAYS (8) 86 | #define CAKE_MAX_TINS (8) 87 | #define CAKE_QUEUES (1024) 88 | #define CAKE_FLOW_MASK 63 89 | #define CAKE_FLOW_NAT_FLAG 64 90 | 91 | /* struct cobalt_params - contains codel and blue parameters 92 | * @interval: codel initial drop rate 93 | * @target: maximum persistent sojourn time & blue update rate 94 | * @mtu_time: serialisation delay of maximum-size packet 95 | * @p_inc: increment of blue drop probability (0.32 fxp) 96 | * @p_dec: decrement of blue drop probability (0.32 fxp) 97 | */ 98 | struct cobalt_params { 99 | u64 interval; 100 | u64 target; 101 | u64 mtu_time; 102 | u32 p_inc; 103 | u32 p_dec; 104 | }; 105 | 106 | /* struct cobalt_vars - contains codel and blue variables 107 | * @count: codel dropping frequency 108 | * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 109 | * @drop_next: time to drop next packet, or when we dropped last 110 | * @blue_timer: Blue time to next drop 111 | * @p_drop: BLUE drop probability (0.32 fxp) 112 | * @dropping: set if in dropping state 113 | * @ecn_marked: set if marked 114 | */ 115 | struct cobalt_vars { 116 | u32 count; 117 | u32 rec_inv_sqrt; 118 | ktime_t drop_next; 119 | ktime_t blue_timer; 120 | u32 p_drop; 121 | bool dropping; 122 | bool ecn_marked; 123 | }; 124 | 125 | enum { 126 | CAKE_SET_NONE = 0, 127 | CAKE_SET_SPARSE, 128 | CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 129 | CAKE_SET_BULK, 130 | CAKE_SET_DECAYING 131 | }; 132 | 133 | struct cake_flow { 134 | /* this stuff is all needed per-flow at dequeue time */ 135 | struct sk_buff *head; 136 | struct sk_buff *tail; 137 | struct list_head flowchain; 138 | s32 deficit; 139 | u32 dropped; 140 | struct cobalt_vars cvars; 141 | u16 srchost; /* index into cake_host table */ 142 | u16 dsthost; 143 | u8 set; 144 | }; /* please try to keep this structure <= 64 bytes */ 145 | 146 | struct cake_host { 147 | u32 srchost_tag; 148 | u32 dsthost_tag; 149 | u16 srchost_bulk_flow_count; 150 | u16 dsthost_bulk_flow_count; 151 | }; 152 | 153 | struct cake_heap_entry { 154 | u16 t:3, b:10; 155 | }; 156 | 157 | struct cake_tin_data { 158 | struct cake_flow flows[CAKE_QUEUES]; 159 | u32 backlogs[CAKE_QUEUES]; 160 | u32 tags[CAKE_QUEUES]; /* for set association */ 161 | u16 overflow_idx[CAKE_QUEUES]; 162 | struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 163 | u32 perturb; 164 | u16 flow_quantum; 165 | 166 | struct cobalt_params cparams; 167 | u32 drop_overlimit; 168 | u16 bulk_flow_count; 169 | u16 sparse_flow_count; 170 | u16 decaying_flow_count; 171 | u16 unresponsive_flow_count; 172 | 173 | u32 max_skblen; 174 | 175 | struct list_head new_flows; 176 | struct list_head old_flows; 177 | struct list_head decaying_flows; 178 | 179 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 180 | ktime_t time_next_packet; 181 | u64 tin_rate_ns; 182 | u64 tin_rate_bps; 183 | u16 tin_rate_shft; 184 | 185 | u16 tin_quantum; 186 | s32 tin_deficit; 187 | u32 tin_backlog; 188 | u32 tin_dropped; 189 | u32 
tin_ecn_mark; 190 | 191 | u32 packets; 192 | u64 bytes; 193 | 194 | u32 ack_drops; 195 | 196 | /* moving averages */ 197 | u64 avge_delay; 198 | u64 peak_delay; 199 | u64 base_delay; 200 | 201 | /* hash function stats */ 202 | u32 way_directs; 203 | u32 way_hits; 204 | u32 way_misses; 205 | u32 way_collisions; 206 | }; /* number of tins is small, so size of this struct doesn't matter much */ 207 | 208 | struct cake_sched_data { 209 | struct tcf_proto __rcu *filter_list; /* optional external classifier */ 210 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) 211 | struct tcf_block *block; 212 | #endif 213 | struct cake_tin_data *tins; 214 | 215 | struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 216 | u16 overflow_timeout; 217 | 218 | u16 tin_cnt; 219 | u8 tin_mode; 220 | u8 flow_mode; 221 | u8 ack_filter; 222 | u8 atm_mode; 223 | 224 | u32 fwmark_mask; 225 | u16 fwmark_shft; 226 | 227 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 228 | u16 rate_shft; 229 | ktime_t time_next_packet; 230 | ktime_t failsafe_next_packet; 231 | u64 rate_ns; 232 | u64 rate_bps; 233 | u16 rate_flags; 234 | s16 rate_overhead; 235 | u16 rate_mpu; 236 | u64 interval; 237 | u64 target; 238 | 239 | /* resource tracking */ 240 | u32 buffer_used; 241 | u32 buffer_max_used; 242 | u32 buffer_limit; 243 | u32 buffer_config_limit; 244 | 245 | /* indices for dequeue */ 246 | u16 cur_tin; 247 | u16 cur_flow; 248 | 249 | struct qdisc_watchdog watchdog; 250 | const u8 *tin_index; 251 | const u8 *tin_order; 252 | 253 | /* bandwidth capacity estimate */ 254 | ktime_t last_packet_time; 255 | ktime_t avg_window_begin; 256 | u64 avg_packet_interval; 257 | u64 avg_window_bytes; 258 | u64 avg_peak_bandwidth; 259 | ktime_t last_reconfig_time; 260 | 261 | /* packet length stats */ 262 | u32 avg_netoff; 263 | u16 max_netlen; 264 | u16 max_adjlen; 265 | u16 min_netlen; 266 | u16 min_adjlen; 267 | }; 268 | 269 | enum { 270 | CAKE_FLAG_OVERHEAD = BIT(0), 271 | CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 272 | CAKE_FLAG_INGRESS = BIT(2), 273 | CAKE_FLAG_WASH = BIT(3), 274 | CAKE_FLAG_SPLIT_GSO = BIT(4) 275 | }; 276 | 277 | /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 278 | * obtain the best features of each. Codel is excellent on flows which 279 | * respond to congestion signals in a TCP-like way. BLUE is more effective on 280 | * unresponsive flows. 
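 * The two signals are combined at dequeue time: cobalt_should_drop() below
 * evaluates the Codel schedule first and then ORs in a random drop with
 * probability p_drop supplied by the BLUE side.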
281 | */ 282 | 283 | struct cobalt_skb_cb { 284 | ktime_t enqueue_time; 285 | u32 adjusted_len; 286 | }; 287 | 288 | static u64 us_to_ns(u64 us) 289 | { 290 | return us * NSEC_PER_USEC; 291 | } 292 | 293 | static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 294 | { 295 | qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 296 | return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 297 | } 298 | 299 | static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 300 | { 301 | return get_cobalt_cb(skb)->enqueue_time; 302 | } 303 | 304 | static void cobalt_set_enqueue_time(struct sk_buff *skb, 305 | ktime_t now) 306 | { 307 | get_cobalt_cb(skb)->enqueue_time = now; 308 | } 309 | 310 | static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 311 | 312 | /* Diffserv lookup tables */ 313 | 314 | static const u8 precedence[] = { 315 | 0, 0, 0, 0, 0, 0, 0, 0, 316 | 1, 1, 1, 1, 1, 1, 1, 1, 317 | 2, 2, 2, 2, 2, 2, 2, 2, 318 | 3, 3, 3, 3, 3, 3, 3, 3, 319 | 4, 4, 4, 4, 4, 4, 4, 4, 320 | 5, 5, 5, 5, 5, 5, 5, 5, 321 | 6, 6, 6, 6, 6, 6, 6, 6, 322 | 7, 7, 7, 7, 7, 7, 7, 7, 323 | }; 324 | 325 | static const u8 diffserv8[] = { 326 | 2, 0, 1, 2, 4, 2, 2, 2, 327 | 1, 2, 1, 2, 1, 2, 1, 2, 328 | 5, 2, 4, 2, 4, 2, 4, 2, 329 | 3, 2, 3, 2, 3, 2, 3, 2, 330 | 6, 2, 3, 2, 3, 2, 3, 2, 331 | 6, 2, 2, 2, 6, 2, 6, 2, 332 | 7, 2, 2, 2, 2, 2, 2, 2, 333 | 7, 2, 2, 2, 2, 2, 2, 2, 334 | }; 335 | 336 | static const u8 diffserv4[] = { 337 | 0, 1, 0, 0, 2, 0, 0, 0, 338 | 1, 0, 0, 0, 0, 0, 0, 0, 339 | 2, 0, 2, 0, 2, 0, 2, 0, 340 | 2, 0, 2, 0, 2, 0, 2, 0, 341 | 3, 0, 2, 0, 2, 0, 2, 0, 342 | 3, 0, 0, 0, 3, 0, 3, 0, 343 | 3, 0, 0, 0, 0, 0, 0, 0, 344 | 3, 0, 0, 0, 0, 0, 0, 0, 345 | }; 346 | 347 | static const u8 diffserv3[] = { 348 | 0, 1, 0, 0, 2, 0, 0, 0, 349 | 1, 0, 0, 0, 0, 0, 0, 0, 350 | 0, 0, 0, 0, 0, 0, 0, 0, 351 | 0, 0, 0, 0, 0, 0, 0, 0, 352 | 0, 0, 0, 0, 0, 0, 0, 0, 353 | 0, 0, 0, 0, 2, 0, 2, 0, 354 | 2, 0, 0, 0, 0, 0, 0, 0, 355 | 2, 0, 0, 0, 0, 0, 0, 0, 356 | }; 357 | 358 | static const u8 besteffort[] = { 359 | 0, 0, 0, 0, 0, 0, 0, 0, 360 | 0, 0, 0, 0, 0, 0, 0, 0, 361 | 0, 0, 0, 0, 0, 0, 0, 0, 362 | 0, 0, 0, 0, 0, 0, 0, 0, 363 | 0, 0, 0, 0, 0, 0, 0, 0, 364 | 0, 0, 0, 0, 0, 0, 0, 0, 365 | 0, 0, 0, 0, 0, 0, 0, 0, 366 | 0, 0, 0, 0, 0, 0, 0, 0, 367 | }; 368 | 369 | /* tin priority order for stats dumping */ 370 | 371 | static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 372 | static const u8 bulk_order[] = {1, 0, 2, 3}; 373 | 374 | #define REC_INV_SQRT_CACHE (16) 375 | static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; 376 | 377 | /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 378 | * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 379 | * 380 | * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 381 | */ 382 | 383 | static void cobalt_newton_step(struct cobalt_vars *vars) 384 | { 385 | u32 invsqrt, invsqrt2; 386 | u64 val; 387 | 388 | invsqrt = vars->rec_inv_sqrt; 389 | invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 390 | val = (3LL << 32) - ((u64)vars->count * invsqrt2); 391 | 392 | val >>= 2; /* avoid overflow in following multiply */ 393 | val = (val * invsqrt) >> (32 - 2 + 1); 394 | 395 | vars->rec_inv_sqrt = val; 396 | } 397 | 398 | static void cobalt_invsqrt(struct cobalt_vars *vars) 399 | { 400 | if (vars->count < REC_INV_SQRT_CACHE) 401 | vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; 402 | else 403 | cobalt_newton_step(vars); 404 | } 405 | 406 | /* There is a big difference in timing between the accurate values 
placed in 407 | * the cache and the approximations given by a single Newton step for small 408 | * count values, particularly when stepping from count 1 to 2 or vice versa. 409 | * Above 16, a single Newton step gives sufficient accuracy in either 410 | * direction, given the precision stored. 411 | * 412 | * The magnitude of the error when stepping up to count 2 is such as to give 413 | * the value that *should* have been produced at count 4. 414 | */ 415 | 416 | static void cobalt_cache_init(void) 417 | { 418 | struct cobalt_vars v; 419 | 420 | memset(&v, 0, sizeof(v)); 421 | v.rec_inv_sqrt = ~0U; 422 | cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; 423 | 424 | for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { 425 | cobalt_newton_step(&v); 426 | cobalt_newton_step(&v); 427 | cobalt_newton_step(&v); 428 | cobalt_newton_step(&v); 429 | 430 | cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; 431 | } 432 | } 433 | 434 | static void cobalt_vars_init(struct cobalt_vars *vars) 435 | { 436 | memset(vars, 0, sizeof(*vars)); 437 | 438 | if (!cobalt_rec_inv_sqrt_cache[0]) { 439 | cobalt_cache_init(); 440 | cobalt_rec_inv_sqrt_cache[0] = ~0; 441 | } 442 | } 443 | 444 | /* CoDel control_law is t + interval/sqrt(count) 445 | * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 446 | * both sqrt() and divide operation. 447 | */ 448 | static ktime_t cobalt_control(ktime_t t, 449 | u64 interval, 450 | u32 rec_inv_sqrt) 451 | { 452 | return ktime_add_ns(t, reciprocal_scale(interval, 453 | rec_inv_sqrt)); 454 | } 455 | 456 | /* Call this when a packet had to be dropped due to queue overflow. Returns 457 | * true if the BLUE state was quiescent before but active after this call. 458 | */ 459 | static bool cobalt_queue_full(struct cobalt_vars *vars, 460 | struct cobalt_params *p, 461 | ktime_t now) 462 | { 463 | bool up = false; 464 | 465 | if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 466 | up = !vars->p_drop; 467 | vars->p_drop += p->p_inc; 468 | if (vars->p_drop < p->p_inc) 469 | vars->p_drop = ~0; 470 | vars->blue_timer = now; 471 | } 472 | vars->dropping = true; 473 | vars->drop_next = now; 474 | if (!vars->count) 475 | vars->count = 1; 476 | 477 | return up; 478 | } 479 | 480 | /* Call this when the queue was serviced but turned out to be empty. Returns 481 | * true if the BLUE state was active before but quiescent after this call. 
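 * This is the counterpart of cobalt_queue_full() above: that one raises
 * p_drop by p_inc on overflow, this one decays it by p_dec once the queue
 * drains, so an idle queue eventually returns BLUE to quiescence.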
482 | */ 483 | static bool cobalt_queue_empty(struct cobalt_vars *vars, 484 | struct cobalt_params *p, 485 | ktime_t now) 486 | { 487 | bool down = false; 488 | 489 | if (vars->p_drop && 490 | ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 491 | if (vars->p_drop < p->p_dec) 492 | vars->p_drop = 0; 493 | else 494 | vars->p_drop -= p->p_dec; 495 | vars->blue_timer = now; 496 | down = !vars->p_drop; 497 | } 498 | vars->dropping = false; 499 | 500 | if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 501 | vars->count--; 502 | cobalt_invsqrt(vars); 503 | vars->drop_next = cobalt_control(vars->drop_next, 504 | p->interval, 505 | vars->rec_inv_sqrt); 506 | } 507 | 508 | return down; 509 | } 510 | 511 | static __be16 cake_skb_proto(const struct sk_buff *skb) 512 | { 513 | unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); 514 | __be16 proto = skb->protocol; 515 | struct vlan_hdr vhdr, *vh; 516 | 517 | while (proto == htons(ETH_P_8021Q) || proto == htons(ETH_P_8021AD)) { 518 | vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); 519 | if (!vh) 520 | break; 521 | 522 | proto = vh->h_vlan_encapsulated_proto; 523 | offset += sizeof(vhdr); 524 | } 525 | 526 | return proto; 527 | } 528 | 529 | static int cake_set_ce(struct sk_buff *skb) 530 | { 531 | int wlen = skb_network_offset(skb); 532 | 533 | switch (cake_skb_proto(skb)) { 534 | case htons(ETH_P_IP): 535 | wlen += sizeof(struct iphdr); 536 | if (!pskb_may_pull(skb, wlen) || 537 | skb_try_make_writable(skb, wlen)) 538 | return 0; 539 | 540 | return IP_ECN_set_ce(ip_hdr(skb)); 541 | 542 | case htons(ETH_P_IPV6): 543 | wlen += sizeof(struct ipv6hdr); 544 | if (!pskb_may_pull(skb, wlen) || 545 | skb_try_make_writable(skb, wlen)) 546 | return 0; 547 | 548 | return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); 549 | 550 | default: 551 | return 0; 552 | } 553 | 554 | return 0; 555 | } 556 | 557 | /* Call this with a freshly dequeued packet for possible congestion marking. 558 | * Returns true as an instruction to drop the packet, false for delivery. 559 | */ 560 | static bool cobalt_should_drop(struct cobalt_vars *vars, 561 | struct cobalt_params *p, 562 | ktime_t now, 563 | struct sk_buff *skb, 564 | u32 bulk_flows) 565 | { 566 | bool next_due, over_target, drop = false; 567 | ktime_t schedule; 568 | u64 sojourn; 569 | 570 | /* The 'schedule' variable records, in its sign, whether 'now' is before or 571 | * after 'drop_next'. This allows 'drop_next' to be updated before the next 572 | * scheduling decision is actually branched, without destroying that 573 | * information. Similarly, the first 'schedule' value calculated is preserved 574 | * in the boolean 'next_due'. 575 | * 576 | * As for 'drop_next', we take advantage of the fact that 'interval' is both 577 | * the delay between first exceeding 'target' and the first signalling event, 578 | * *and* the scaling factor for the signalling frequency. It's therefore very 579 | * natural to use a single mechanism for both purposes, and eliminates a 580 | * significant amount of reference Codel's spaghetti code. To help with this, 581 | * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 582 | * as possible to 1.0 in fixed-point. 
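 *
 * Concrete illustration of the control law: with interval = 100 ms and
 * count = 4, the next signalling event falls interval/sqrt(count) = 50 ms
 * after the previous 'drop_next'. Since rec_inv_sqrt stores 1/sqrt(count)
 * in Q0.32, cobalt_control() computes this with a single multiply via
 * reciprocal_scale().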
583 | */ 584 | 585 | sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 586 | schedule = ktime_sub(now, vars->drop_next); 587 | over_target = sojourn > p->target && 588 | sojourn > p->mtu_time * bulk_flows * 2 && 589 | sojourn > p->mtu_time * 4; 590 | next_due = vars->count && ktime_to_ns(schedule) >= 0; 591 | 592 | vars->ecn_marked = false; 593 | 594 | if (over_target) { 595 | if (!vars->dropping) { 596 | vars->dropping = true; 597 | vars->drop_next = cobalt_control(now, 598 | p->interval, 599 | vars->rec_inv_sqrt); 600 | } 601 | if (!vars->count) 602 | vars->count = 1; 603 | } else if (vars->dropping) { 604 | vars->dropping = false; 605 | } 606 | 607 | if (next_due && vars->dropping) { 608 | /* Use ECN mark if possible, otherwise drop */ 609 | drop = !(vars->ecn_marked = cake_set_ce(skb)); 610 | 611 | vars->count++; 612 | if (!vars->count) 613 | vars->count--; 614 | cobalt_invsqrt(vars); 615 | vars->drop_next = cobalt_control(vars->drop_next, 616 | p->interval, 617 | vars->rec_inv_sqrt); 618 | schedule = ktime_sub(now, vars->drop_next); 619 | } else { 620 | while (next_due) { 621 | vars->count--; 622 | cobalt_invsqrt(vars); 623 | vars->drop_next = cobalt_control(vars->drop_next, 624 | p->interval, 625 | vars->rec_inv_sqrt); 626 | schedule = ktime_sub(now, vars->drop_next); 627 | next_due = vars->count && ktime_to_ns(schedule) >= 0; 628 | } 629 | } 630 | 631 | /* Simple BLUE implementation. Lack of ECN is deliberate. */ 632 | if (vars->p_drop) 633 | drop |= (prandom_u32() < vars->p_drop); 634 | 635 | /* Overload the drop_next field as an activity timeout */ 636 | if (!vars->count) 637 | vars->drop_next = ktime_add_ns(now, p->interval); 638 | else if (ktime_to_ns(schedule) > 0 && !drop) 639 | vars->drop_next = now; 640 | 641 | return drop; 642 | } 643 | 644 | #if IS_REACHABLE(CONFIG_NF_CONNTRACK) 645 | 646 | static void cake_update_flowkeys(struct flow_keys *keys, 647 | const struct sk_buff *skb) 648 | { 649 | const struct nf_conntrack_tuple *tuple; 650 | enum ip_conntrack_info ctinfo; 651 | struct nf_conn *ct; 652 | bool rev = false; 653 | 654 | if (cake_skb_proto(skb) != htons(ETH_P_IP)) 655 | return; 656 | 657 | ct = nf_ct_get(skb, &ctinfo); 658 | if (ct) { 659 | tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 660 | } else { 661 | const struct nf_conntrack_tuple_hash *hash; 662 | struct nf_conntrack_tuple srctuple; 663 | 664 | #if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE 665 | if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 666 | NFPROTO_IPV4, &srctuple)) 667 | #else 668 | if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 669 | NFPROTO_IPV4, dev_net(skb->dev), 670 | &srctuple)) 671 | #endif 672 | return; 673 | 674 | #if KERNEL_VERSION(4, 3, 0) > LINUX_VERSION_CODE 675 | hash = nf_conntrack_find_get(dev_net(skb->dev), 676 | NF_CT_DEFAULT_ZONE, 677 | &srctuple); 678 | #else 679 | hash = nf_conntrack_find_get(dev_net(skb->dev), 680 | &nf_ct_zone_dflt, 681 | &srctuple); 682 | #endif 683 | if (!hash) 684 | return; 685 | 686 | rev = true; 687 | ct = nf_ct_tuplehash_to_ctrack(hash); 688 | tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 689 | } 690 | 691 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 692 | keys->src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; 693 | keys->dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; 694 | #else 695 | keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; 696 | keys->addrs.v4addrs.dst = rev ? 
tuple->src.u3.ip : tuple->dst.u3.ip; 697 | #endif 698 | 699 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 700 | if (keys->ports) { 701 | keys->port16[0] = rev ? tuple->dst.u.all : tuple->src.u.all; 702 | keys->port16[1] = rev ? tuple->src.u.all : tuple->dst.u.all; 703 | } 704 | #else 705 | if (keys->ports.ports) { 706 | keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; 707 | keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; 708 | } 709 | #endif 710 | if (rev) 711 | nf_ct_put(ct); 712 | } 713 | #else 714 | static void cake_update_flowkeys(struct flow_keys *keys, 715 | const struct sk_buff *skb) 716 | { 717 | /* There is nothing we can do here without CONNTRACK */ 718 | } 719 | #endif 720 | 721 | /* Cake has several subtle multiple bit settings. In these cases you 722 | * would be matching triple isolate mode as well. 723 | */ 724 | 725 | static bool cake_dsrc(int flow_mode) 726 | { 727 | return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 728 | } 729 | 730 | static bool cake_ddst(int flow_mode) 731 | { 732 | return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 733 | } 734 | 735 | static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 736 | int flow_mode, u16 flow_override, u16 host_override) 737 | { 738 | u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 739 | u16 reduced_hash, srchost_idx, dsthost_idx; 740 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 741 | struct flow_keys keys; 742 | #else 743 | struct flow_keys keys, host_keys; 744 | #endif 745 | 746 | if (unlikely(flow_mode == CAKE_FLOW_NONE)) 747 | return 0; 748 | 749 | /* If both overrides are set we can skip packet dissection entirely */ 750 | if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && 751 | (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) 752 | goto skip_hash; 753 | 754 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 755 | skb_flow_dissect(skb, &keys); 756 | 757 | if (flow_mode & CAKE_FLOW_NAT_FLAG) 758 | cake_update_flowkeys(&keys, skb); 759 | 760 | srchost_hash = jhash_1word((__force u32)keys.src, q->perturb); 761 | dsthost_hash = jhash_1word((__force u32)keys.dst, q->perturb); 762 | 763 | if (flow_mode & CAKE_FLOW_FLOWS) 764 | flow_hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src ^ keys.ip_proto, (__force u32)keys.ports, q->perturb); 765 | 766 | #else 767 | 768 | /* Linux kernel 4.2.x have skb_flow_dissect_flow_keys which takes only 2 769 | * arguments 770 | */ 771 | #if (KERNEL_VERSION(4, 2, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 3, 0) > LINUX_VERSION_CODE) 772 | skb_flow_dissect_flow_keys(skb, &keys); 773 | #else 774 | skb_flow_dissect_flow_keys(skb, &keys, 775 | FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 776 | #endif 777 | 778 | if (flow_mode & CAKE_FLOW_NAT_FLAG) 779 | cake_update_flowkeys(&keys, skb); 780 | 781 | /* flow_hash_from_keys() sorts the addresses by value, so we have 782 | * to preserve their order in a separate data structure to treat 783 | * src and dst host addresses as independently selectable. 
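 *
 * (Worked example: because flow_hash_from_keys() canonicalises the address
 *  pair by sorting it, {src=10.1.1.1, dst=10.1.1.2} and {src=10.1.1.2,
 *  dst=10.1.1.1} hash identically, which is fine for flows but useless for
 *  telling the two hosts apart.  Hence the host_keys copy below scrubs the
 *  per-flow fields and zeroes one address at a time, roughly:
 *
 *      host_keys.addrs.v4addrs.src = 0;
 *      dsthost_hash = flow_hash_from_keys(&host_keys);   // dst host only
 *      host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 *      host_keys.addrs.v4addrs.dst = 0;
 *      srchost_hash = flow_hash_from_keys(&host_keys);   // src host only
 *
 *  so each per-host hash depends on exactly one, un-swapped address.)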
784 | */ 785 | host_keys = keys; 786 | host_keys.ports.ports = 0; 787 | host_keys.basic.ip_proto = 0; 788 | host_keys.keyid.keyid = 0; 789 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 790 | host_keys.tags.vlan_id = 0; 791 | #endif 792 | host_keys.tags.flow_label = 0; 793 | 794 | switch (host_keys.control.addr_type) { 795 | case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 796 | host_keys.addrs.v4addrs.src = 0; 797 | dsthost_hash = flow_hash_from_keys(&host_keys); 798 | host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 799 | host_keys.addrs.v4addrs.dst = 0; 800 | srchost_hash = flow_hash_from_keys(&host_keys); 801 | break; 802 | 803 | case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 804 | memset(&host_keys.addrs.v6addrs.src, 0, 805 | sizeof(host_keys.addrs.v6addrs.src)); 806 | dsthost_hash = flow_hash_from_keys(&host_keys); 807 | host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 808 | memset(&host_keys.addrs.v6addrs.dst, 0, 809 | sizeof(host_keys.addrs.v6addrs.dst)); 810 | srchost_hash = flow_hash_from_keys(&host_keys); 811 | break; 812 | 813 | default: 814 | dsthost_hash = 0; 815 | srchost_hash = 0; 816 | } 817 | 818 | /* This *must* be after the above switch, since as a 819 | * side-effect it sorts the src and dst addresses. 820 | */ 821 | if (flow_mode & CAKE_FLOW_FLOWS) 822 | flow_hash = flow_hash_from_keys(&keys); 823 | #endif 824 | 825 | skip_hash: 826 | if (flow_override) 827 | flow_hash = flow_override - 1; 828 | if (host_override) { 829 | dsthost_hash = host_override - 1; 830 | srchost_hash = host_override - 1; 831 | } 832 | 833 | if (!(flow_mode & CAKE_FLOW_FLOWS)) { 834 | if (flow_mode & CAKE_FLOW_SRC_IP) 835 | flow_hash ^= srchost_hash; 836 | 837 | if (flow_mode & CAKE_FLOW_DST_IP) 838 | flow_hash ^= dsthost_hash; 839 | } 840 | 841 | reduced_hash = flow_hash % CAKE_QUEUES; 842 | 843 | /* set-associative hashing */ 844 | /* fast path if no hash collision (direct lookup succeeds) */ 845 | if (likely(q->tags[reduced_hash] == flow_hash && 846 | q->flows[reduced_hash].set)) { 847 | q->way_directs++; 848 | } else { 849 | u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 850 | u32 outer_hash = reduced_hash - inner_hash; 851 | bool allocate_src = false; 852 | bool allocate_dst = false; 853 | u32 i, k; 854 | 855 | /* check if any active queue in the set is reserved for 856 | * this flow. 857 | */ 858 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 859 | i++, k = (k + 1) % CAKE_SET_WAYS) { 860 | if (q->tags[outer_hash + k] == flow_hash) { 861 | if (i) 862 | q->way_hits++; 863 | 864 | if (!q->flows[outer_hash + k].set) { 865 | /* need to increment host refcnts */ 866 | allocate_src = cake_dsrc(flow_mode); 867 | allocate_dst = cake_ddst(flow_mode); 868 | } 869 | 870 | goto found; 871 | } 872 | } 873 | 874 | /* no queue is reserved for this flow, look for an 875 | * empty one. 876 | */ 877 | for (i = 0; i < CAKE_SET_WAYS; 878 | i++, k = (k + 1) % CAKE_SET_WAYS) { 879 | if (!q->flows[outer_hash + k].set) { 880 | q->way_misses++; 881 | allocate_src = cake_dsrc(flow_mode); 882 | allocate_dst = cake_ddst(flow_mode); 883 | goto found; 884 | } 885 | } 886 | 887 | /* With no empty queues, default to the original 888 | * queue, accept the collision, update the host tags. 
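 *
 * (Worked example of the set-associative probing above, assuming
 *  CAKE_SET_WAYS is 8: a flow_hash reducing to queue 21 gives
 *
 *      inner_hash = 21 % 8  = 5
 *      outer_hash = 21 - 5  = 16
 *
 *  so the set spans queues 16..23 and is probed in the order
 *  21, 22, 23, 16, 17, 18, 19, 20 - first looking for a matching tag,
 *  then for an unused queue, before falling back to the collision
 *  handling below.)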
889 | */ 890 | q->way_collisions++; 891 | if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 892 | q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; 893 | q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; 894 | } 895 | allocate_src = cake_dsrc(flow_mode); 896 | allocate_dst = cake_ddst(flow_mode); 897 | found: 898 | /* reserve queue for future packets in same flow */ 899 | reduced_hash = outer_hash + k; 900 | q->tags[reduced_hash] = flow_hash; 901 | 902 | if (allocate_src) { 903 | srchost_idx = srchost_hash % CAKE_QUEUES; 904 | inner_hash = srchost_idx % CAKE_SET_WAYS; 905 | outer_hash = srchost_idx - inner_hash; 906 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 907 | i++, k = (k + 1) % CAKE_SET_WAYS) { 908 | if (q->hosts[outer_hash + k].srchost_tag == 909 | srchost_hash) 910 | goto found_src; 911 | } 912 | for (i = 0; i < CAKE_SET_WAYS; 913 | i++, k = (k + 1) % CAKE_SET_WAYS) { 914 | if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 915 | break; 916 | } 917 | q->hosts[outer_hash + k].srchost_tag = srchost_hash; 918 | found_src: 919 | srchost_idx = outer_hash + k; 920 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) 921 | q->hosts[srchost_idx].srchost_bulk_flow_count++; 922 | q->flows[reduced_hash].srchost = srchost_idx; 923 | } 924 | 925 | if (allocate_dst) { 926 | dsthost_idx = dsthost_hash % CAKE_QUEUES; 927 | inner_hash = dsthost_idx % CAKE_SET_WAYS; 928 | outer_hash = dsthost_idx - inner_hash; 929 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 930 | i++, k = (k + 1) % CAKE_SET_WAYS) { 931 | if (q->hosts[outer_hash + k].dsthost_tag == 932 | dsthost_hash) 933 | goto found_dst; 934 | } 935 | for (i = 0; i < CAKE_SET_WAYS; 936 | i++, k = (k + 1) % CAKE_SET_WAYS) { 937 | if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 938 | break; 939 | } 940 | q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 941 | found_dst: 942 | dsthost_idx = outer_hash + k; 943 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) 944 | q->hosts[dsthost_idx].dsthost_bulk_flow_count++; 945 | q->flows[reduced_hash].dsthost = dsthost_idx; 946 | } 947 | } 948 | 949 | return reduced_hash; 950 | } 951 | 952 | /* helper functions : might be changed when/if skb use a standard list_head */ 953 | /* remove one skb from head of slot queue */ 954 | 955 | static struct sk_buff *dequeue_head(struct cake_flow *flow) 956 | { 957 | struct sk_buff *skb = flow->head; 958 | 959 | if (skb) { 960 | flow->head = skb->next; 961 | skb->next = NULL; 962 | } 963 | 964 | return skb; 965 | } 966 | 967 | /* add skb to flow queue (tail add) */ 968 | 969 | static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 970 | { 971 | if (!flow->head) 972 | flow->head = skb; 973 | else 974 | flow->tail->next = skb; 975 | flow->tail = skb; 976 | skb->next = NULL; 977 | } 978 | 979 | static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 980 | struct ipv6hdr *buf) 981 | { 982 | unsigned int offset = skb_network_offset(skb); 983 | struct iphdr *iph; 984 | 985 | iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 986 | 987 | if (!iph) 988 | return NULL; 989 | 990 | if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 991 | return skb_header_pointer(skb, offset + iph->ihl * 4, 992 | sizeof(struct ipv6hdr), buf); 993 | 994 | else if (iph->version == 4) 995 | return iph; 996 | 997 | else if (iph->version == 6) 998 | return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 999 | buf); 1000 | 1001 | return NULL; 1002 | } 1003 | 1004 | static struct tcphdr 
*cake_get_tcphdr(const struct sk_buff *skb, 1005 | void *buf, unsigned int bufsize) 1006 | { 1007 | unsigned int offset = skb_network_offset(skb); 1008 | const struct ipv6hdr *ipv6h; 1009 | const struct tcphdr *tcph; 1010 | const struct iphdr *iph; 1011 | struct ipv6hdr _ipv6h; 1012 | struct tcphdr _tcph; 1013 | 1014 | ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 1015 | 1016 | if (!ipv6h) 1017 | return NULL; 1018 | 1019 | if (ipv6h->version == 4) { 1020 | iph = (struct iphdr *)ipv6h; 1021 | offset += iph->ihl * 4; 1022 | 1023 | /* special-case 6in4 tunnelling, as that is a common way to get 1024 | * v6 connectivity in the home 1025 | */ 1026 | if (iph->protocol == IPPROTO_IPV6) { 1027 | ipv6h = skb_header_pointer(skb, offset, 1028 | sizeof(_ipv6h), &_ipv6h); 1029 | 1030 | if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 1031 | return NULL; 1032 | 1033 | offset += sizeof(struct ipv6hdr); 1034 | 1035 | } else if (iph->protocol != IPPROTO_TCP) { 1036 | return NULL; 1037 | } 1038 | 1039 | } else if (ipv6h->version == 6) { 1040 | if (ipv6h->nexthdr != IPPROTO_TCP) 1041 | return NULL; 1042 | 1043 | offset += sizeof(struct ipv6hdr); 1044 | } else { 1045 | return NULL; 1046 | } 1047 | 1048 | tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 1049 | if (!tcph) 1050 | return NULL; 1051 | 1052 | return skb_header_pointer(skb, offset, 1053 | min(__tcp_hdrlen(tcph), bufsize), buf); 1054 | } 1055 | 1056 | static const void *cake_get_tcpopt(const struct tcphdr *tcph, 1057 | int code, int *oplen) 1058 | { 1059 | /* inspired by tcp_parse_options in tcp_input.c */ 1060 | int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1061 | const u8 *ptr = (const u8 *)(tcph + 1); 1062 | 1063 | while (length > 0) { 1064 | int opcode = *ptr++; 1065 | int opsize; 1066 | 1067 | if (opcode == TCPOPT_EOL) 1068 | break; 1069 | if (opcode == TCPOPT_NOP) { 1070 | length--; 1071 | continue; 1072 | } 1073 | if (length < 2) 1074 | break; 1075 | opsize = *ptr++; 1076 | if (opsize < 2 || opsize > length) 1077 | break; 1078 | 1079 | if (opcode == code) { 1080 | *oplen = opsize; 1081 | return ptr; 1082 | } 1083 | 1084 | ptr += opsize - 2; 1085 | length -= opsize; 1086 | } 1087 | 1088 | return NULL; 1089 | } 1090 | 1091 | /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 1092 | * bytes than the other. In the case where both sequences ACKs bytes that the 1093 | * other doesn't, A is considered greater. DSACKs in A also makes A be 1094 | * considered greater. 
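 *
 * (Illustrative example: if tcph_a SACKs [1000,2000) and tcph_b SACKs
 *  [1000,3000), every range in A is covered by B and B covers more bytes,
 *  so the result is 1, assuming the blocks lie above A's cumulative ACK
 *  and are therefore not DSACKs.  If A additionally SACKs [5000,6000),
 *  that range is not covered by B and the result is -1, i.e. A carries
 *  information that would be lost if it were dropped.  Equal coverage -
 *  including the case where neither header carries a SACK option at all -
 *  yields 0.)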
1095 | * 1096 | * @return -1, 0 or 1 as normal compare functions 1097 | */ 1098 | static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 1099 | const struct tcphdr *tcph_b) 1100 | { 1101 | const struct tcp_sack_block_wire *sack_a, *sack_b; 1102 | u32 ack_seq_a = ntohl(tcph_a->ack_seq); 1103 | u32 bytes_a = 0, bytes_b = 0; 1104 | int oplen_a, oplen_b; 1105 | bool first = true; 1106 | 1107 | sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 1108 | sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 1109 | 1110 | /* pointers point to option contents */ 1111 | oplen_a -= TCPOLEN_SACK_BASE; 1112 | oplen_b -= TCPOLEN_SACK_BASE; 1113 | 1114 | if (sack_a && oplen_a >= sizeof(*sack_a) && 1115 | (!sack_b || oplen_b < sizeof(*sack_b))) 1116 | return -1; 1117 | else if (sack_b && oplen_b >= sizeof(*sack_b) && 1118 | (!sack_a || oplen_a < sizeof(*sack_a))) 1119 | return 1; 1120 | else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1121 | (!sack_b || oplen_b < sizeof(*sack_b))) 1122 | return 0; 1123 | 1124 | while (oplen_a >= sizeof(*sack_a)) { 1125 | const struct tcp_sack_block_wire *sack_tmp = sack_b; 1126 | u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1127 | u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1128 | int oplen_tmp = oplen_b; 1129 | bool found = false; 1130 | 1131 | /* DSACK; always considered greater to prevent dropping */ 1132 | if (before(start_a, ack_seq_a)) 1133 | return -1; 1134 | 1135 | bytes_a += end_a - start_a; 1136 | 1137 | while (oplen_tmp >= sizeof(*sack_tmp)) { 1138 | u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1139 | u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1140 | 1141 | /* first time through we count the total size */ 1142 | if (first) 1143 | bytes_b += end_b - start_b; 1144 | 1145 | if (!after(start_b, start_a) && !before(end_b, end_a)) { 1146 | found = true; 1147 | if (!first) 1148 | break; 1149 | } 1150 | oplen_tmp -= sizeof(*sack_tmp); 1151 | sack_tmp++; 1152 | } 1153 | 1154 | if (!found) 1155 | return -1; 1156 | 1157 | oplen_a -= sizeof(*sack_a); 1158 | sack_a++; 1159 | first = false; 1160 | } 1161 | 1162 | /* If we made it this far, all ranges SACKed by A are covered by B, so 1163 | * either the SACKs are equal, or B SACKs more bytes. 1164 | */ 1165 | return bytes_b > bytes_a ? 
1 : 0; 1166 | } 1167 | 1168 | static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1169 | u32 *tsval, u32 *tsecr) 1170 | { 1171 | const u8 *ptr; 1172 | int opsize; 1173 | 1174 | ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1175 | 1176 | if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1177 | *tsval = get_unaligned_be32(ptr); 1178 | *tsecr = get_unaligned_be32(ptr + 4); 1179 | } 1180 | } 1181 | 1182 | static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1183 | u32 tstamp_new, u32 tsecr_new) 1184 | { 1185 | /* inspired by tcp_parse_options in tcp_input.c */ 1186 | int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1187 | const u8 *ptr = (const u8 *)(tcph + 1); 1188 | u32 tstamp, tsecr; 1189 | 1190 | /* 3 reserved flags must be unset to avoid future breakage 1191 | * ACK must be set 1192 | * ECE/CWR are handled separately 1193 | * All other flags URG/PSH/RST/SYN/FIN must be unset 1194 | * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1195 | * 0x00C00000 = CWR/ECE (handled separately) 1196 | * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1197 | */ 1198 | if (((tcp_flag_word(tcph) & 1199 | cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1200 | return false; 1201 | 1202 | while (length > 0) { 1203 | int opcode = *ptr++; 1204 | int opsize; 1205 | 1206 | if (opcode == TCPOPT_EOL) 1207 | break; 1208 | if (opcode == TCPOPT_NOP) { 1209 | length--; 1210 | continue; 1211 | } 1212 | if (length < 2) 1213 | break; 1214 | opsize = *ptr++; 1215 | if (opsize < 2 || opsize > length) 1216 | break; 1217 | 1218 | switch (opcode) { 1219 | case TCPOPT_MD5SIG: /* doesn't influence state */ 1220 | break; 1221 | 1222 | case TCPOPT_SACK: /* stricter checking performed later */ 1223 | if (opsize % 8 != 2) 1224 | return false; 1225 | break; 1226 | 1227 | case TCPOPT_TIMESTAMP: 1228 | /* only drop timestamps lower than new */ 1229 | if (opsize != TCPOLEN_TIMESTAMP) 1230 | return false; 1231 | tstamp = get_unaligned_be32(ptr); 1232 | tsecr = get_unaligned_be32(ptr + 4); 1233 | if (after(tstamp, tstamp_new) || 1234 | after(tsecr, tsecr_new)) 1235 | return false; 1236 | break; 1237 | 1238 | case TCPOPT_MSS: /* these should only be set on SYN */ 1239 | case TCPOPT_WINDOW: 1240 | case TCPOPT_SACK_PERM: 1241 | case TCPOPT_FASTOPEN: 1242 | case TCPOPT_EXP: 1243 | default: /* don't drop if any unknown options are present */ 1244 | return false; 1245 | } 1246 | 1247 | ptr += opsize - 2; 1248 | length -= opsize; 1249 | } 1250 | 1251 | return true; 1252 | } 1253 | 1254 | static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1255 | struct cake_flow *flow) 1256 | { 1257 | bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; 1258 | struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1259 | struct sk_buff *skb_check, *skb_prev = NULL; 1260 | const struct ipv6hdr *ipv6h, *ipv6h_check; 1261 | unsigned char _tcph[64], _tcph_check[64]; 1262 | const struct tcphdr *tcph, *tcph_check; 1263 | const struct iphdr *iph, *iph_check; 1264 | struct ipv6hdr _iph, _iph_check; 1265 | const struct sk_buff *skb; 1266 | int seglen, num_found = 0; 1267 | u32 tstamp = 0, tsecr = 0; 1268 | __be32 elig_flags = 0; 1269 | int sack_comp; 1270 | 1271 | /* no other possible ACKs to filter */ 1272 | if (flow->head == flow->tail) 1273 | return NULL; 1274 | 1275 | skb = flow->tail; 1276 | tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1277 | iph = cake_get_iphdr(skb, &_iph); 1278 | if (!tcph) 1279 | return NULL; 1280 | 1281 | cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1282 | 1283 | /* the 'triggering' packet need only have 
the ACK flag set. 1284 | * also check that SYN is not set, as there won't be any previous ACKs. 1285 | */ 1286 | if ((tcp_flag_word(tcph) & 1287 | (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1288 | return NULL; 1289 | 1290 | /* the 'triggering' ACK is at the tail of the queue, we have already 1291 | * returned if it is the only packet in the flow. loop through the rest 1292 | * of the queue looking for pure ACKs with the same 5-tuple as the 1293 | * triggering one. 1294 | */ 1295 | for (skb_check = flow->head; 1296 | skb_check && skb_check != skb; 1297 | skb_prev = skb_check, skb_check = skb_check->next) { 1298 | iph_check = cake_get_iphdr(skb_check, &_iph_check); 1299 | tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1300 | sizeof(_tcph_check)); 1301 | 1302 | /* only TCP packets with matching 5-tuple are eligible, and only 1303 | * drop safe headers 1304 | */ 1305 | if (!tcph_check || iph->version != iph_check->version || 1306 | tcph_check->source != tcph->source || 1307 | tcph_check->dest != tcph->dest) 1308 | continue; 1309 | 1310 | if (iph_check->version == 4) { 1311 | if (iph_check->saddr != iph->saddr || 1312 | iph_check->daddr != iph->daddr) 1313 | continue; 1314 | 1315 | seglen = ntohs(iph_check->tot_len) - 1316 | (4 * iph_check->ihl); 1317 | } else if (iph_check->version == 6) { 1318 | ipv6h = (struct ipv6hdr *)iph; 1319 | ipv6h_check = (struct ipv6hdr *)iph_check; 1320 | 1321 | if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1322 | ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1323 | continue; 1324 | 1325 | seglen = ntohs(ipv6h_check->payload_len); 1326 | } else { 1327 | WARN_ON(1); /* shouldn't happen */ 1328 | continue; 1329 | } 1330 | 1331 | /* If the ECE/CWR flags changed from the previous eligible 1332 | * packet in the same flow, we should no longer be dropping that 1333 | * previous packet as this would lose information. 1334 | */ 1335 | if (elig_ack && (tcp_flag_word(tcph_check) & 1336 | (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1337 | elig_ack = NULL; 1338 | elig_ack_prev = NULL; 1339 | num_found--; 1340 | } 1341 | 1342 | /* Check TCP options and flags, don't drop ACKs with segment 1343 | * data, and don't drop ACKs with a higher cumulative ACK 1344 | * counter than the triggering packet. Check ACK seqno here to 1345 | * avoid parsing SACK options of packets we are going to exclude 1346 | * anyway. 1347 | */ 1348 | if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1349 | (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1350 | after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1351 | continue; 1352 | 1353 | /* Check SACK options. The triggering packet must SACK more data 1354 | * than the ACK under consideration, or SACK the same range but 1355 | * have a larger cumulative ACK counter. The latter is a 1356 | * pathological case, but is contained in the following check 1357 | * anyway, just to be safe. 1358 | */ 1359 | sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1360 | 1361 | if (sack_comp < 0 || 1362 | (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1363 | sack_comp == 0)) 1364 | continue; 1365 | 1366 | /* At this point we have found an eligible pure ACK to drop; if 1367 | * we are in aggressive mode, we are done. Otherwise, keep 1368 | * searching unless this is the second eligible ACK we 1369 | * found. 1370 | * 1371 | * Since we want to drop ACK closest to the head of the queue, 1372 | * save the first eligible ACK we find, even if we need to loop 1373 | * again. 
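 *
 * (Rationale sketch: eligibility already requires that a candidate's
 *  cumulative ACK is not above the triggering packet's, so e.g. queued pure
 *  ACKs for byte 1000 and byte 2000 followed by a trigger ACKing byte 3000
 *  are redundant; dropping the one closest to the head (the ACK for 1000)
 *  discards the stalest information while the newer ACKs still deliver
 *  everything the receiver has signalled.)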
1374 | */ 1375 | if (!elig_ack) { 1376 | elig_ack = skb_check; 1377 | elig_ack_prev = skb_prev; 1378 | elig_flags = (tcp_flag_word(tcph_check) 1379 | & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1380 | } 1381 | 1382 | if (num_found++ > 0) 1383 | goto found; 1384 | } 1385 | 1386 | /* We made it through the queue without finding two eligible ACKs . If 1387 | * we found a single eligible ACK we can drop it in aggressive mode if 1388 | * we can guarantee that this does not interfere with ECN flag 1389 | * information. We ensure this by dropping it only if the enqueued 1390 | * packet is consecutive with the eligible ACK, and their flags match. 1391 | */ 1392 | if (elig_ack && aggressive && elig_ack->next == skb && 1393 | (elig_flags == (tcp_flag_word(tcph) & 1394 | (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1395 | goto found; 1396 | 1397 | return NULL; 1398 | 1399 | found: 1400 | if (elig_ack_prev) 1401 | elig_ack_prev->next = elig_ack->next; 1402 | else 1403 | flow->head = elig_ack->next; 1404 | 1405 | elig_ack->next = NULL; 1406 | 1407 | return elig_ack; 1408 | } 1409 | 1410 | static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1411 | { 1412 | avg -= avg >> shift; 1413 | avg += sample >> shift; 1414 | return avg; 1415 | } 1416 | 1417 | static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) 1418 | { 1419 | if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1420 | len -= off; 1421 | 1422 | if (q->max_netlen < len) 1423 | q->max_netlen = len; 1424 | if (q->min_netlen > len) 1425 | q->min_netlen = len; 1426 | 1427 | len += q->rate_overhead; 1428 | 1429 | if (len < q->rate_mpu) 1430 | len = q->rate_mpu; 1431 | 1432 | if (q->atm_mode == CAKE_ATM_ATM) { 1433 | len += 47; 1434 | len /= 48; 1435 | len *= 53; 1436 | } else if (q->atm_mode == CAKE_ATM_PTM) { 1437 | /* Add one byte per 64 bytes or part thereof. 1438 | * This is conservative and easier to calculate than the 1439 | * precise value. 
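 *
 * (Worked numbers: a 1500 byte packet gets (1500 + 63) / 64 = 24 extra bytes
 *  here, i.e. 1524 bytes on a 64b/65b encoded (PTM) link, while the ATM
 *  branch above rounds the same packet up to (1500 + 47) / 48 = 32 cells of
 *  53 bytes, i.e. 1696 bytes on the wire.)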
1440 | */ 1441 | len += (len + 63) / 64; 1442 | } 1443 | 1444 | if (q->max_adjlen < len) 1445 | q->max_adjlen = len; 1446 | if (q->min_adjlen > len) 1447 | q->min_adjlen = len; 1448 | 1449 | return len; 1450 | } 1451 | 1452 | static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1453 | { 1454 | const struct skb_shared_info *shinfo = skb_shinfo(skb); 1455 | unsigned int hdr_len, last_len = 0; 1456 | u32 off = skb_network_offset(skb); 1457 | u32 len = qdisc_pkt_len(skb); 1458 | u16 segs = 1; 1459 | 1460 | q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); 1461 | 1462 | if (!shinfo->gso_size) 1463 | return cake_calc_overhead(q, len, off); 1464 | 1465 | /* borrowed from qdisc_pkt_len_init() */ 1466 | hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 1467 | 1468 | /* + transport layer */ 1469 | if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1470 | SKB_GSO_TCPV6))) { 1471 | const struct tcphdr *th; 1472 | struct tcphdr _tcphdr; 1473 | 1474 | th = skb_header_pointer(skb, skb_transport_offset(skb), 1475 | sizeof(_tcphdr), &_tcphdr); 1476 | if (likely(th)) 1477 | hdr_len += __tcp_hdrlen(th); 1478 | } else { 1479 | struct udphdr _udphdr; 1480 | 1481 | if (skb_header_pointer(skb, skb_transport_offset(skb), 1482 | sizeof(_udphdr), &_udphdr)) 1483 | hdr_len += sizeof(struct udphdr); 1484 | } 1485 | 1486 | if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) 1487 | segs = DIV_ROUND_UP(skb->len - hdr_len, 1488 | shinfo->gso_size); 1489 | else 1490 | segs = shinfo->gso_segs; 1491 | 1492 | len = shinfo->gso_size + hdr_len; 1493 | last_len = skb->len - shinfo->gso_size * (segs - 1); 1494 | 1495 | return (cake_calc_overhead(q, len, off) * (segs - 1) + 1496 | cake_calc_overhead(q, last_len, off)); 1497 | } 1498 | 1499 | static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1500 | { 1501 | struct cake_heap_entry ii = q->overflow_heap[i]; 1502 | struct cake_heap_entry jj = q->overflow_heap[j]; 1503 | 1504 | q->overflow_heap[i] = jj; 1505 | q->overflow_heap[j] = ii; 1506 | 1507 | q->tins[ii.t].overflow_idx[ii.b] = j; 1508 | q->tins[jj.t].overflow_idx[jj.b] = i; 1509 | } 1510 | 1511 | static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1512 | { 1513 | struct cake_heap_entry ii = q->overflow_heap[i]; 1514 | 1515 | return q->tins[ii.t].backlogs[ii.b]; 1516 | } 1517 | 1518 | static void cake_heapify(struct cake_sched_data *q, u16 i) 1519 | { 1520 | static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1521 | u32 mb = cake_heap_get_backlog(q, i); 1522 | u32 m = i; 1523 | 1524 | while (m < a) { 1525 | u32 l = m + m + 1; 1526 | u32 r = l + 1; 1527 | 1528 | if (l < a) { 1529 | u32 lb = cake_heap_get_backlog(q, l); 1530 | 1531 | if (lb > mb) { 1532 | m = l; 1533 | mb = lb; 1534 | } 1535 | } 1536 | 1537 | if (r < a) { 1538 | u32 rb = cake_heap_get_backlog(q, r); 1539 | 1540 | if (rb > mb) { 1541 | m = r; 1542 | mb = rb; 1543 | } 1544 | } 1545 | 1546 | if (m != i) { 1547 | cake_heap_swap(q, i, m); 1548 | i = m; 1549 | } else { 1550 | break; 1551 | } 1552 | } 1553 | } 1554 | 1555 | static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1556 | { 1557 | while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1558 | u16 p = (i - 1) >> 1; 1559 | u32 ib = cake_heap_get_backlog(q, i); 1560 | u32 pb = cake_heap_get_backlog(q, p); 1561 | 1562 | if (ib > pb) { 1563 | cake_heap_swap(q, i, p); 1564 | i = p; 1565 | } else { 1566 | break; 1567 | } 1568 | } 1569 | } 1570 | 1571 | static int cake_advance_shaper(struct cake_sched_data *q, 1572 | struct cake_tin_data *b, 
1573 | struct sk_buff *skb, 1574 | ktime_t now, bool drop) 1575 | { 1576 | u32 len = get_cobalt_cb(skb)->adjusted_len; 1577 | 1578 | /* charge packet bandwidth to this tin 1579 | * and to the global shaper. 1580 | */ 1581 | if (q->rate_ns) { 1582 | u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1583 | u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1584 | u64 failsafe_dur = global_dur + (global_dur >> 1); 1585 | 1586 | if (ktime_before(b->time_next_packet, now)) 1587 | b->time_next_packet = ktime_add_ns(b->time_next_packet, 1588 | tin_dur); 1589 | 1590 | else if (ktime_before(b->time_next_packet, 1591 | ktime_add_ns(now, tin_dur))) 1592 | b->time_next_packet = ktime_add_ns(now, tin_dur); 1593 | 1594 | q->time_next_packet = ktime_add_ns(q->time_next_packet, 1595 | global_dur); 1596 | if (!drop) 1597 | q->failsafe_next_packet = \ 1598 | ktime_add_ns(q->failsafe_next_packet, 1599 | failsafe_dur); 1600 | } 1601 | return len; 1602 | } 1603 | 1604 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1605 | static unsigned int cake_drop(struct Qdisc *sch) 1606 | #else 1607 | static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1608 | #endif 1609 | { 1610 | struct cake_sched_data *q = qdisc_priv(sch); 1611 | ktime_t now = ktime_get(); 1612 | u32 idx = 0, tin = 0, len; 1613 | struct cake_heap_entry qq; 1614 | struct cake_tin_data *b; 1615 | struct cake_flow *flow; 1616 | struct sk_buff *skb; 1617 | 1618 | if (!q->overflow_timeout) { 1619 | int i; 1620 | /* Build fresh max-heap */ 1621 | for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) 1622 | cake_heapify(q, i); 1623 | } 1624 | q->overflow_timeout = 65535; 1625 | 1626 | /* select longest queue for pruning */ 1627 | qq = q->overflow_heap[0]; 1628 | tin = qq.t; 1629 | idx = qq.b; 1630 | 1631 | b = &q->tins[tin]; 1632 | flow = &b->flows[idx]; 1633 | skb = dequeue_head(flow); 1634 | if (unlikely(!skb)) { 1635 | /* heap has gone wrong, rebuild it next time */ 1636 | q->overflow_timeout = 0; 1637 | return idx + (tin << 16); 1638 | } 1639 | 1640 | if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1641 | b->unresponsive_flow_count++; 1642 | 1643 | len = qdisc_pkt_len(skb); 1644 | q->buffer_used -= skb->truesize; 1645 | b->backlogs[idx] -= len; 1646 | b->tin_backlog -= len; 1647 | sch->qstats.backlog -= len; 1648 | qdisc_tree_reduce_backlog(sch, 1, len); 1649 | 1650 | flow->dropped++; 1651 | b->tin_dropped++; 1652 | sch->qstats.drops++; 1653 | 1654 | if (q->rate_flags & CAKE_FLAG_INGRESS) 1655 | cake_advance_shaper(q, b, skb, now, true); 1656 | 1657 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1658 | kfree_skb(skb); 1659 | #else 1660 | __qdisc_drop(skb, to_free); 1661 | #endif 1662 | sch->q.qlen--; 1663 | 1664 | cake_heapify(q, 0); 1665 | 1666 | return idx + (tin << 16); 1667 | } 1668 | 1669 | static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) 1670 | { 1671 | const int offset = skb_network_offset(skb); 1672 | u16 *buf, buf_; 1673 | u8 dscp; 1674 | 1675 | switch (cake_skb_proto(skb)) { 1676 | case htons(ETH_P_IP): 1677 | buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1678 | if (unlikely(!buf)) 1679 | return 0; 1680 | 1681 | /* ToS is in the second byte of iphdr */ 1682 | dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; 1683 | 1684 | if (wash && dscp) { 1685 | const int wlen = offset + sizeof(struct iphdr); 1686 | 1687 | if (!pskb_may_pull(skb, wlen) || 1688 | skb_try_make_writable(skb, wlen)) 1689 | return 0; 1690 | 1691 | ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1692 | 
} 1693 | 1694 | return dscp; 1695 | 1696 | case htons(ETH_P_IPV6): 1697 | buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1698 | if (unlikely(!buf)) 1699 | return 0; 1700 | 1701 | /* Traffic class is in the first and second bytes of ipv6hdr */ 1702 | dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; 1703 | 1704 | if (wash && dscp) { 1705 | const int wlen = offset + sizeof(struct ipv6hdr); 1706 | 1707 | if (!pskb_may_pull(skb, wlen) || 1708 | skb_try_make_writable(skb, wlen)) 1709 | return 0; 1710 | 1711 | ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1712 | } 1713 | 1714 | return dscp; 1715 | 1716 | case htons(ETH_P_ARP): 1717 | return 0x38; /* CS7 - Net Control */ 1718 | 1719 | default: 1720 | /* If there is no Diffserv field, treat as best-effort */ 1721 | return 0; 1722 | } 1723 | } 1724 | 1725 | static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1726 | struct sk_buff *skb) 1727 | { 1728 | struct cake_sched_data *q = qdisc_priv(sch); 1729 | u32 tin, mark; 1730 | bool wash; 1731 | u8 dscp; 1732 | 1733 | /* Tin selection: Default to diffserv-based selection, allow overriding 1734 | * using firewall marks or skb->priority. Call DSCP parsing early if 1735 | * wash is enabled, otherwise defer to below to skip unneeded parsing. 1736 | */ 1737 | mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1738 | wash = !!(q->rate_flags & CAKE_FLAG_WASH); 1739 | if (wash) 1740 | dscp = cake_handle_diffserv(skb, wash); 1741 | 1742 | if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1743 | tin = 0; 1744 | 1745 | else if (mark && mark <= q->tin_cnt) 1746 | tin = q->tin_order[mark - 1]; 1747 | 1748 | else if (TC_H_MAJ(skb->priority) == sch->handle && 1749 | TC_H_MIN(skb->priority) > 0 && 1750 | TC_H_MIN(skb->priority) <= q->tin_cnt) 1751 | tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; 1752 | 1753 | else { 1754 | if (!wash) 1755 | dscp = cake_handle_diffserv(skb, wash); 1756 | tin = q->tin_index[dscp]; 1757 | 1758 | if (unlikely(tin >= q->tin_cnt)) 1759 | tin = 0; 1760 | } 1761 | 1762 | return &q->tins[tin]; 1763 | } 1764 | 1765 | static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1766 | struct sk_buff *skb, int flow_mode, int *qerr) 1767 | { 1768 | struct cake_sched_data *q = qdisc_priv(sch); 1769 | struct tcf_proto *filter; 1770 | struct tcf_result res; 1771 | u16 flow = 0, host = 0; 1772 | int result; 1773 | 1774 | filter = rcu_dereference_bh(q->filter_list); 1775 | if (!filter) 1776 | goto hash; 1777 | 1778 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1779 | result = tcf_classify(skb, filter, &res, false); 1780 | 1781 | if (result >= 0) { 1782 | #ifdef CONFIG_NET_CLS_ACT 1783 | switch (result) { 1784 | case TC_ACT_STOLEN: 1785 | case TC_ACT_QUEUED: 1786 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) 1787 | case TC_ACT_TRAP: 1788 | #endif 1789 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1790 | /* fall through */ 1791 | case TC_ACT_SHOT: 1792 | return 0; 1793 | } 1794 | #endif 1795 | if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1796 | flow = TC_H_MIN(res.classid); 1797 | if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1798 | host = TC_H_MAJ(res.classid) >> 16; 1799 | } 1800 | hash: 1801 | *t = cake_select_tin(sch, skb); 1802 | return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1803 | } 1804 | 1805 | static void cake_reconfigure(struct Qdisc *sch); 1806 | 1807 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1808 | static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch) 1809 | #else 1810 | static s32 cake_enqueue(struct sk_buff 
*skb, struct Qdisc *sch, 1811 | struct sk_buff **to_free) 1812 | #endif 1813 | { 1814 | struct cake_sched_data *q = qdisc_priv(sch); 1815 | int len = qdisc_pkt_len(skb); 1816 | int uninitialized_var(ret); 1817 | struct sk_buff *ack = NULL; 1818 | ktime_t now = ktime_get(); 1819 | struct cake_tin_data *b; 1820 | struct cake_flow *flow; 1821 | u32 idx; 1822 | 1823 | /* choose flow to insert into */ 1824 | idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); 1825 | if (idx == 0) { 1826 | if (ret & __NET_XMIT_BYPASS) 1827 | qdisc_qstats_drop(sch); 1828 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1829 | kfree_skb(skb); 1830 | #else 1831 | __qdisc_drop(skb, to_free); 1832 | #endif 1833 | return ret; 1834 | } 1835 | idx--; 1836 | flow = &b->flows[idx]; 1837 | 1838 | /* ensure shaper state isn't stale */ 1839 | if (!b->tin_backlog) { 1840 | if (ktime_before(b->time_next_packet, now)) 1841 | b->time_next_packet = now; 1842 | 1843 | if (!sch->q.qlen) { 1844 | if (ktime_before(q->time_next_packet, now)) { 1845 | q->failsafe_next_packet = now; 1846 | q->time_next_packet = now; 1847 | } else if (ktime_after(q->time_next_packet, now) && 1848 | ktime_after(q->failsafe_next_packet, now)) { 1849 | u64 next = \ 1850 | min(ktime_to_ns(q->time_next_packet), 1851 | ktime_to_ns( 1852 | q->failsafe_next_packet)); 1853 | sch->qstats.overlimits++; 1854 | qdisc_watchdog_schedule_ns(&q->watchdog, next); 1855 | } 1856 | } 1857 | } 1858 | 1859 | if (unlikely(len > b->max_skblen)) 1860 | b->max_skblen = len; 1861 | 1862 | if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1863 | struct sk_buff *segs, *nskb; 1864 | netdev_features_t features = netif_skb_features(skb); 1865 | unsigned int slen = 0, numsegs = 0; 1866 | 1867 | segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1868 | if (IS_ERR_OR_NULL(segs)) 1869 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1870 | return qdisc_reshape_fail(skb, sch); 1871 | #else 1872 | return qdisc_drop(skb, sch, to_free); 1873 | #endif 1874 | 1875 | while (segs) { 1876 | nskb = segs->next; 1877 | segs->next = NULL; 1878 | qdisc_skb_cb(segs)->pkt_len = segs->len; 1879 | cobalt_set_enqueue_time(segs, now); 1880 | get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1881 | segs); 1882 | flow_queue_add(flow, segs); 1883 | 1884 | sch->q.qlen++; 1885 | numsegs++; 1886 | slen += segs->len; 1887 | q->buffer_used += segs->truesize; 1888 | b->packets++; 1889 | segs = nskb; 1890 | } 1891 | 1892 | /* stats */ 1893 | b->bytes += slen; 1894 | b->backlogs[idx] += slen; 1895 | b->tin_backlog += slen; 1896 | sch->qstats.backlog += slen; 1897 | q->avg_window_bytes += slen; 1898 | 1899 | qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1900 | consume_skb(skb); 1901 | } else { 1902 | /* not splitting */ 1903 | cobalt_set_enqueue_time(skb, now); 1904 | get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1905 | flow_queue_add(flow, skb); 1906 | 1907 | if (q->ack_filter) 1908 | ack = cake_ack_filter(q, flow); 1909 | 1910 | if (ack) { 1911 | b->ack_drops++; 1912 | sch->qstats.drops++; 1913 | b->bytes += qdisc_pkt_len(ack); 1914 | len -= qdisc_pkt_len(ack); 1915 | q->buffer_used += skb->truesize - ack->truesize; 1916 | if (q->rate_flags & CAKE_FLAG_INGRESS) 1917 | cake_advance_shaper(q, b, ack, now, true); 1918 | 1919 | qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); 1920 | consume_skb(ack); 1921 | } else { 1922 | sch->q.qlen++; 1923 | q->buffer_used += skb->truesize; 1924 | } 1925 | 1926 | /* stats */ 1927 | b->packets++; 1928 | b->bytes += len; 1929 | 
b->backlogs[idx] += len; 1930 | b->tin_backlog += len; 1931 | sch->qstats.backlog += len; 1932 | q->avg_window_bytes += len; 1933 | } 1934 | 1935 | if (q->overflow_timeout) 1936 | cake_heapify_up(q, b->overflow_idx[idx]); 1937 | 1938 | /* incoming bandwidth capacity estimate */ 1939 | if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1940 | u64 packet_interval = \ 1941 | ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1942 | 1943 | if (packet_interval > NSEC_PER_SEC) 1944 | packet_interval = NSEC_PER_SEC; 1945 | 1946 | /* filter out short-term bursts, eg. wifi aggregation */ 1947 | q->avg_packet_interval = \ 1948 | cake_ewma(q->avg_packet_interval, 1949 | packet_interval, 1950 | (packet_interval > q->avg_packet_interval ? 1951 | 2 : 8)); 1952 | 1953 | q->last_packet_time = now; 1954 | 1955 | if (packet_interval > q->avg_packet_interval) { 1956 | u64 window_interval = \ 1957 | ktime_to_ns(ktime_sub(now, 1958 | q->avg_window_begin)); 1959 | u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1960 | 1961 | b = div64_u64(b, window_interval); 1962 | q->avg_peak_bandwidth = 1963 | cake_ewma(q->avg_peak_bandwidth, b, 1964 | b > q->avg_peak_bandwidth ? 2 : 8); 1965 | q->avg_window_bytes = 0; 1966 | q->avg_window_begin = now; 1967 | 1968 | if (ktime_after(now, 1969 | ktime_add_ms(q->last_reconfig_time, 1970 | 250))) { 1971 | q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1972 | cake_reconfigure(sch); 1973 | } 1974 | } 1975 | } else { 1976 | q->avg_window_bytes = 0; 1977 | q->last_packet_time = now; 1978 | } 1979 | 1980 | /* flowchain */ 1981 | if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1982 | struct cake_host *srchost = &b->hosts[flow->srchost]; 1983 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; 1984 | u16 host_load = 1; 1985 | 1986 | if (!flow->set) { 1987 | list_add_tail(&flow->flowchain, &b->new_flows); 1988 | } else { 1989 | b->decaying_flow_count--; 1990 | list_move_tail(&flow->flowchain, &b->new_flows); 1991 | } 1992 | flow->set = CAKE_SET_SPARSE; 1993 | b->sparse_flow_count++; 1994 | 1995 | if (cake_dsrc(q->flow_mode)) 1996 | host_load = max(host_load, srchost->srchost_bulk_flow_count); 1997 | 1998 | if (cake_ddst(q->flow_mode)) 1999 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2000 | 2001 | flow->deficit = (b->flow_quantum * 2002 | quantum_div[host_load]) >> 16; 2003 | } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 2004 | struct cake_host *srchost = &b->hosts[flow->srchost]; 2005 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; 2006 | 2007 | /* this flow was empty, accounted as a sparse flow, but actually 2008 | * in the bulk rotation. 
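 *
 * (Rough lifecycle, as implemented here and in cake_dequeue() below: a flow
 *  normally goes NONE -> SPARSE on its first packet; if cake_dequeue() finds
 *  it empty with its deficit spent it is parked as SPARSE_WAIT in the bulk
 *  rotation, and the next packet to arrive - this branch - promotes it to
 *  BULK and charges the per-host bulk-flow counters used for host fairness.)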
2009 | */ 2010 | flow->set = CAKE_SET_BULK; 2011 | b->sparse_flow_count--; 2012 | b->bulk_flow_count++; 2013 | 2014 | if (cake_dsrc(q->flow_mode)) 2015 | srchost->srchost_bulk_flow_count++; 2016 | 2017 | if (cake_ddst(q->flow_mode)) 2018 | dsthost->dsthost_bulk_flow_count++; 2019 | 2020 | } 2021 | 2022 | if (q->buffer_used > q->buffer_max_used) 2023 | q->buffer_max_used = q->buffer_used; 2024 | 2025 | if (q->buffer_used > q->buffer_limit) { 2026 | u32 dropped = 0; 2027 | 2028 | while (q->buffer_used > q->buffer_limit) { 2029 | dropped++; 2030 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 2031 | cake_drop(sch); 2032 | #else 2033 | cake_drop(sch, to_free); 2034 | #endif 2035 | } 2036 | b->drop_overlimit += dropped; 2037 | } 2038 | return NET_XMIT_SUCCESS; 2039 | } 2040 | 2041 | static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 2042 | { 2043 | struct cake_sched_data *q = qdisc_priv(sch); 2044 | struct cake_tin_data *b = &q->tins[q->cur_tin]; 2045 | struct cake_flow *flow = &b->flows[q->cur_flow]; 2046 | struct sk_buff *skb = NULL; 2047 | u32 len; 2048 | 2049 | if (flow->head) { 2050 | skb = dequeue_head(flow); 2051 | len = qdisc_pkt_len(skb); 2052 | b->backlogs[q->cur_flow] -= len; 2053 | b->tin_backlog -= len; 2054 | sch->qstats.backlog -= len; 2055 | q->buffer_used -= skb->truesize; 2056 | sch->q.qlen--; 2057 | 2058 | if (q->overflow_timeout) 2059 | cake_heapify(q, b->overflow_idx[q->cur_flow]); 2060 | } 2061 | return skb; 2062 | } 2063 | 2064 | /* Discard leftover packets from a tin no longer in use. */ 2065 | static void cake_clear_tin(struct Qdisc *sch, u16 tin) 2066 | { 2067 | struct cake_sched_data *q = qdisc_priv(sch); 2068 | struct sk_buff *skb; 2069 | 2070 | q->cur_tin = tin; 2071 | for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 2072 | while (!!(skb = cake_dequeue_one(sch))) 2073 | kfree_skb(skb); 2074 | } 2075 | 2076 | static struct sk_buff *cake_dequeue(struct Qdisc *sch) 2077 | { 2078 | struct cake_sched_data *q = qdisc_priv(sch); 2079 | struct cake_tin_data *b = &q->tins[q->cur_tin]; 2080 | struct cake_host *srchost, *dsthost; 2081 | ktime_t now = ktime_get(); 2082 | struct cake_flow *flow; 2083 | struct list_head *head; 2084 | bool first_flow = true; 2085 | struct sk_buff *skb; 2086 | u16 host_load; 2087 | u64 delay; 2088 | u32 len; 2089 | 2090 | begin: 2091 | if (!sch->q.qlen) 2092 | return NULL; 2093 | 2094 | /* global hard shaper */ 2095 | if (ktime_after(q->time_next_packet, now) && 2096 | ktime_after(q->failsafe_next_packet, now)) { 2097 | u64 next = min(ktime_to_ns(q->time_next_packet), 2098 | ktime_to_ns(q->failsafe_next_packet)); 2099 | 2100 | sch->qstats.overlimits++; 2101 | qdisc_watchdog_schedule_ns(&q->watchdog, next); 2102 | return NULL; 2103 | } 2104 | 2105 | /* Choose a class to work on. */ 2106 | if (!q->rate_ns) { 2107 | /* In unlimited mode, can't rely on shaper timings, just balance 2108 | * with DRR 2109 | */ 2110 | bool wrapped = false, empty = true; 2111 | 2112 | while (b->tin_deficit < 0 || 2113 | !(b->sparse_flow_count + b->bulk_flow_count)) { 2114 | if (b->tin_deficit <= 0) 2115 | b->tin_deficit += b->tin_quantum; 2116 | if (b->sparse_flow_count + b->bulk_flow_count) 2117 | empty = false; 2118 | 2119 | q->cur_tin++; 2120 | b++; 2121 | if (q->cur_tin >= q->tin_cnt) { 2122 | q->cur_tin = 0; 2123 | b = q->tins; 2124 | 2125 | if (wrapped) { 2126 | /* It's possible for q->qlen to be 2127 | * nonzero when we actually have no 2128 | * packets anywhere. 
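 *
 * (Illustrative weights for this DRR pass: each time a tin's deficit is
 *  refilled it gains tin_quantum bytes of credit, so in unlimited mode the
 *  tins share the link roughly in proportion to their quanta - e.g. the
 *  diffserv3 configuration further down programs quanta of 1024, 64 and 256,
 *  giving long-term shares of about 16:1:4 when all three tins are
 *  backlogged.)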
2129 | */ 2130 | if (empty) 2131 | return NULL; 2132 | } else { 2133 | wrapped = true; 2134 | } 2135 | } 2136 | } 2137 | } else { 2138 | /* In shaped mode, choose: 2139 | * - Highest-priority tin with queue and meeting schedule, or 2140 | * - The earliest-scheduled tin with queue. 2141 | */ 2142 | ktime_t best_time = ns_to_ktime(KTIME_MAX); 2143 | int tin, best_tin = 0; 2144 | 2145 | for (tin = 0; tin < q->tin_cnt; tin++) { 2146 | b = q->tins + tin; 2147 | if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 2148 | ktime_t time_to_pkt = \ 2149 | ktime_sub(b->time_next_packet, now); 2150 | 2151 | if (ktime_to_ns(time_to_pkt) <= 0 || 2152 | ktime_compare(time_to_pkt, 2153 | best_time) <= 0) { 2154 | best_time = time_to_pkt; 2155 | best_tin = tin; 2156 | } 2157 | } 2158 | } 2159 | 2160 | q->cur_tin = best_tin; 2161 | b = q->tins + best_tin; 2162 | 2163 | /* No point in going further if no packets to deliver. */ 2164 | if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2165 | return NULL; 2166 | } 2167 | 2168 | retry: 2169 | /* service this class */ 2170 | head = &b->decaying_flows; 2171 | if (!first_flow || list_empty(head)) { 2172 | head = &b->new_flows; 2173 | if (list_empty(head)) { 2174 | head = &b->old_flows; 2175 | if (unlikely(list_empty(head))) { 2176 | head = &b->decaying_flows; 2177 | if (unlikely(list_empty(head))) 2178 | goto begin; 2179 | } 2180 | } 2181 | } 2182 | flow = list_first_entry(head, struct cake_flow, flowchain); 2183 | q->cur_flow = flow - b->flows; 2184 | first_flow = false; 2185 | 2186 | /* triple isolation (modified DRR++) */ 2187 | srchost = &b->hosts[flow->srchost]; 2188 | dsthost = &b->hosts[flow->dsthost]; 2189 | host_load = 1; 2190 | 2191 | /* flow isolation (DRR++) */ 2192 | if (flow->deficit <= 0) { 2193 | /* Keep all flows with deficits out of the sparse and decaying 2194 | * rotations. No non-empty flow can go into the decaying 2195 | * rotation, so they can't get deficits 2196 | */ 2197 | if (flow->set == CAKE_SET_SPARSE) { 2198 | if (flow->head) { 2199 | b->sparse_flow_count--; 2200 | b->bulk_flow_count++; 2201 | 2202 | if (cake_dsrc(q->flow_mode)) 2203 | srchost->srchost_bulk_flow_count++; 2204 | 2205 | if (cake_ddst(q->flow_mode)) 2206 | dsthost->dsthost_bulk_flow_count++; 2207 | 2208 | flow->set = CAKE_SET_BULK; 2209 | } else { 2210 | /* we've moved it to the bulk rotation for 2211 | * correct deficit accounting but we still want 2212 | * to count it as a sparse flow, not a bulk one. 
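 *
 * (Deficit arithmetic used just below, as a sketch: quantum_div[host_load]
 *  approximates 65535 / host_load, so with the default flow_quantum of
 *  1514 bytes a host carrying two bulk flows refills each of them by about
 *
 *      (1514 * (65535 / 2)) >> 16  ~=  757 bytes
 *
 *  per rotation (plus a little dithering), i.e. the host's aggregate share
 *  stays near one quantum no matter how many flows it opens.)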
2213 | */ 2214 | flow->set = CAKE_SET_SPARSE_WAIT; 2215 | } 2216 | } 2217 | 2218 | if (cake_dsrc(q->flow_mode)) 2219 | host_load = max(host_load, srchost->srchost_bulk_flow_count); 2220 | 2221 | if (cake_ddst(q->flow_mode)) 2222 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2223 | 2224 | WARN_ON(host_load > CAKE_QUEUES); 2225 | 2226 | /* The shifted prandom_u32() is a way to apply dithering to 2227 | * avoid accumulating roundoff errors 2228 | */ 2229 | flow->deficit += (b->flow_quantum * quantum_div[host_load] + 2230 | (prandom_u32() >> 16)) >> 16; 2231 | list_move_tail(&flow->flowchain, &b->old_flows); 2232 | 2233 | goto retry; 2234 | } 2235 | 2236 | /* Retrieve a packet via the AQM */ 2237 | while (1) { 2238 | skb = cake_dequeue_one(sch); 2239 | if (!skb) { 2240 | /* this queue was actually empty */ 2241 | if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2242 | b->unresponsive_flow_count--; 2243 | 2244 | if (flow->cvars.p_drop || flow->cvars.count || 2245 | ktime_before(now, flow->cvars.drop_next)) { 2246 | /* keep in the flowchain until the state has 2247 | * decayed to rest 2248 | */ 2249 | list_move_tail(&flow->flowchain, 2250 | &b->decaying_flows); 2251 | if (flow->set == CAKE_SET_BULK) { 2252 | b->bulk_flow_count--; 2253 | 2254 | if (cake_dsrc(q->flow_mode)) 2255 | srchost->srchost_bulk_flow_count--; 2256 | 2257 | if (cake_ddst(q->flow_mode)) 2258 | dsthost->dsthost_bulk_flow_count--; 2259 | 2260 | b->decaying_flow_count++; 2261 | } else if (flow->set == CAKE_SET_SPARSE || 2262 | flow->set == CAKE_SET_SPARSE_WAIT) { 2263 | b->sparse_flow_count--; 2264 | b->decaying_flow_count++; 2265 | } 2266 | flow->set = CAKE_SET_DECAYING; 2267 | } else { 2268 | /* remove empty queue from the flowchain */ 2269 | list_del_init(&flow->flowchain); 2270 | if (flow->set == CAKE_SET_SPARSE || 2271 | flow->set == CAKE_SET_SPARSE_WAIT) 2272 | b->sparse_flow_count--; 2273 | else if (flow->set == CAKE_SET_BULK) { 2274 | b->bulk_flow_count--; 2275 | 2276 | if (cake_dsrc(q->flow_mode)) 2277 | srchost->srchost_bulk_flow_count--; 2278 | 2279 | if (cake_ddst(q->flow_mode)) 2280 | dsthost->dsthost_bulk_flow_count--; 2281 | 2282 | } else 2283 | b->decaying_flow_count--; 2284 | 2285 | flow->set = CAKE_SET_NONE; 2286 | } 2287 | goto begin; 2288 | } 2289 | 2290 | /* Last packet in queue may be marked, shouldn't be dropped */ 2291 | if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2292 | (b->bulk_flow_count * 2293 | !!(q->rate_flags & 2294 | CAKE_FLAG_INGRESS))) || 2295 | !flow->head) 2296 | break; 2297 | 2298 | /* drop this packet, get another one */ 2299 | if (q->rate_flags & CAKE_FLAG_INGRESS) { 2300 | len = cake_advance_shaper(q, b, skb, 2301 | now, true); 2302 | flow->deficit -= len; 2303 | b->tin_deficit -= len; 2304 | } 2305 | flow->dropped++; 2306 | b->tin_dropped++; 2307 | qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2308 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 2309 | qdisc_drop(skb, sch); 2310 | #else 2311 | qdisc_qstats_drop(sch); 2312 | kfree_skb(skb); 2313 | #endif 2314 | if (q->rate_flags & CAKE_FLAG_INGRESS) 2315 | goto retry; 2316 | } 2317 | 2318 | b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2319 | qdisc_bstats_update(sch, skb); 2320 | 2321 | /* collect delay stats */ 2322 | delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2323 | b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2324 | b->peak_delay = cake_ewma(b->peak_delay, delay, 2325 | delay > b->peak_delay ? 
2 : 8); 2326 | b->base_delay = cake_ewma(b->base_delay, delay, 2327 | delay < b->base_delay ? 2 : 8); 2328 | 2329 | len = cake_advance_shaper(q, b, skb, now, false); 2330 | flow->deficit -= len; 2331 | b->tin_deficit -= len; 2332 | 2333 | if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2334 | u64 next = min(ktime_to_ns(q->time_next_packet), 2335 | ktime_to_ns(q->failsafe_next_packet)); 2336 | 2337 | qdisc_watchdog_schedule_ns(&q->watchdog, next); 2338 | } else if (!sch->q.qlen) { 2339 | int i; 2340 | 2341 | for (i = 0; i < q->tin_cnt; i++) { 2342 | if (q->tins[i].decaying_flow_count) { 2343 | ktime_t next = \ 2344 | ktime_add_ns(now, 2345 | q->tins[i].cparams.target); 2346 | 2347 | qdisc_watchdog_schedule_ns(&q->watchdog, 2348 | ktime_to_ns(next)); 2349 | break; 2350 | } 2351 | } 2352 | } 2353 | 2354 | if (q->overflow_timeout) 2355 | q->overflow_timeout--; 2356 | 2357 | return skb; 2358 | } 2359 | 2360 | static void cake_reset(struct Qdisc *sch) 2361 | { 2362 | u32 c; 2363 | 2364 | for (c = 0; c < CAKE_MAX_TINS; c++) 2365 | cake_clear_tin(sch, c); 2366 | } 2367 | 2368 | static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2369 | [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2370 | [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2371 | [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2372 | [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2373 | [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2374 | [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2375 | [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2376 | [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2377 | [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2378 | [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2379 | [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2380 | [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2381 | [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2382 | [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2383 | [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2384 | [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2385 | [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2386 | }; 2387 | 2388 | static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2389 | u64 target_ns, u64 rtt_est_ns) 2390 | { 2391 | /* convert byte-rate into time-per-byte 2392 | * so it will always unwedge in reasonable time. 2393 | */ 2394 | static const u64 MIN_RATE = 64; 2395 | u32 byte_target = mtu; 2396 | u64 byte_target_ns; 2397 | u8 rate_shft = 0; 2398 | u64 rate_ns = 0; 2399 | 2400 | b->flow_quantum = 1514; 2401 | if (rate) { 2402 | b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2403 | rate_shft = 34; 2404 | rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2405 | rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2406 | while (!!(rate_ns >> 34)) { 2407 | rate_ns >>= 1; 2408 | rate_shft--; 2409 | } 2410 | } /* else unlimited, ie. 
zero delay */ 2411 | 2412 | b->tin_rate_bps = rate; 2413 | b->tin_rate_ns = rate_ns; 2414 | b->tin_rate_shft = rate_shft; 2415 | 2416 | byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2417 | 2418 | b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2419 | b->cparams.interval = max(rtt_est_ns + 2420 | b->cparams.target - target_ns, 2421 | b->cparams.target * 2); 2422 | b->cparams.mtu_time = byte_target_ns; 2423 | b->cparams.p_inc = 1 << 24; /* 1/256 */ 2424 | b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2425 | } 2426 | 2427 | static int cake_config_besteffort(struct Qdisc *sch) 2428 | { 2429 | struct cake_sched_data *q = qdisc_priv(sch); 2430 | struct cake_tin_data *b = &q->tins[0]; 2431 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2432 | u64 rate = q->rate_bps; 2433 | 2434 | q->tin_cnt = 1; 2435 | 2436 | q->tin_index = besteffort; 2437 | q->tin_order = normal_order; 2438 | 2439 | cake_set_rate(b, rate, mtu, 2440 | us_to_ns(q->target), us_to_ns(q->interval)); 2441 | b->tin_quantum = 65535; 2442 | 2443 | return 0; 2444 | } 2445 | 2446 | static int cake_config_precedence(struct Qdisc *sch) 2447 | { 2448 | /* convert high-level (user visible) parameters into internal format */ 2449 | struct cake_sched_data *q = qdisc_priv(sch); 2450 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2451 | u64 rate = q->rate_bps; 2452 | u32 quantum = 256; 2453 | u32 i; 2454 | 2455 | q->tin_cnt = 8; 2456 | q->tin_index = precedence; 2457 | q->tin_order = normal_order; 2458 | 2459 | for (i = 0; i < q->tin_cnt; i++) { 2460 | struct cake_tin_data *b = &q->tins[i]; 2461 | 2462 | cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2463 | us_to_ns(q->interval)); 2464 | 2465 | b->tin_quantum = max_t(u16, 1U, quantum); 2466 | 2467 | /* calculate next class's parameters */ 2468 | rate *= 7; 2469 | rate >>= 3; 2470 | 2471 | quantum *= 7; 2472 | quantum >>= 3; 2473 | } 2474 | 2475 | return 0; 2476 | } 2477 | 2478 | /* List of known Diffserv codepoints: 2479 | * 2480 | * Least Effort (CS1) 2481 | * Best Effort (CS0) 2482 | * Max Reliability & LLT "Lo" (TOS1) 2483 | * Max Throughput (TOS2) 2484 | * Min Delay (TOS4) 2485 | * LLT "La" (TOS5) 2486 | * Assured Forwarding 1 (AF1x) - x3 2487 | * Assured Forwarding 2 (AF2x) - x3 2488 | * Assured Forwarding 3 (AF3x) - x3 2489 | * Assured Forwarding 4 (AF4x) - x3 2490 | * Precedence Class 2 (CS2) 2491 | * Precedence Class 3 (CS3) 2492 | * Precedence Class 4 (CS4) 2493 | * Precedence Class 5 (CS5) 2494 | * Precedence Class 6 (CS6) 2495 | * Precedence Class 7 (CS7) 2496 | * Voice Admit (VA) 2497 | * Expedited Forwarding (EF) 2498 | 2499 | * Total 25 codepoints. 2500 | */ 2501 | 2502 | /* List of traffic classes in RFC 4594: 2503 | * (roughly descending order of contended priority) 2504 | * (roughly ascending order of uncontended throughput) 2505 | * 2506 | * Network Control (CS6,CS7) - routing traffic 2507 | * Telephony (EF,VA) - aka. VoIP streams 2508 | * Signalling (CS5) - VoIP setup 2509 | * Multimedia Conferencing (AF4x) - aka. video calls 2510 | * Realtime Interactive (CS4) - eg. games 2511 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2512 | * Broadcast Video (CS3) 2513 | * Low Latency Data (AF2x,TOS4) - eg. database 2514 | * Ops, Admin, Management (CS2,TOS1) - eg. ssh 2515 | * Standard Service (CS0 & unrecognised codepoints) 2516 | * High Throughput Data (AF1x,TOS2) - eg. web traffic 2517 | * Low Priority Data (CS1) - eg. BitTorrent 2518 | 2519 | * Total 12 traffic classes. 
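 *
 * (Encoding note: the DSCP occupies the top six bits of the IPv4 TOS /
 *  IPv6 Traffic Class octet, and cake_handle_diffserv() above recovers it
 *  with a ">> 2".  For example EF is codepoint 46, carried on the wire as
 *  0xB8, and CS1 is codepoint 8, carried as 0x20; the CS7 value 0x38 is
 *  what the ARP special case returns directly.)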
2520 | */ 2521 | 2522 | static int cake_config_diffserv8(struct Qdisc *sch) 2523 | { 2524 | /* Pruned list of traffic classes for typical applications: 2525 | * 2526 | * Network Control (CS6, CS7) 2527 | * Minimum Latency (EF, VA, CS5, CS4) 2528 | * Interactive Shell (CS2, TOS1) 2529 | * Low Latency Transactions (AF2x, TOS4) 2530 | * Video Streaming (AF4x, AF3x, CS3) 2531 | * Bog Standard (CS0 etc.) 2532 | * High Throughput (AF1x, TOS2) 2533 | * Background Traffic (CS1) 2534 | * 2535 | * Total 8 traffic classes. 2536 | */ 2537 | 2538 | struct cake_sched_data *q = qdisc_priv(sch); 2539 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2540 | u64 rate = q->rate_bps; 2541 | u32 quantum = 256; 2542 | u32 i; 2543 | 2544 | q->tin_cnt = 8; 2545 | 2546 | /* codepoint to class mapping */ 2547 | q->tin_index = diffserv8; 2548 | q->tin_order = normal_order; 2549 | 2550 | /* class characteristics */ 2551 | for (i = 0; i < q->tin_cnt; i++) { 2552 | struct cake_tin_data *b = &q->tins[i]; 2553 | 2554 | cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2555 | us_to_ns(q->interval)); 2556 | 2557 | b->tin_quantum = max_t(u16, 1U, quantum); 2558 | 2559 | /* calculate next class's parameters */ 2560 | rate *= 7; 2561 | rate >>= 3; 2562 | 2563 | quantum *= 7; 2564 | quantum >>= 3; 2565 | } 2566 | 2567 | return 0; 2568 | } 2569 | 2570 | static int cake_config_diffserv4(struct Qdisc *sch) 2571 | { 2572 | /* Further pruned list of traffic classes for four-class system: 2573 | * 2574 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2575 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 2576 | * Best Effort (CS0, AF1x, TOS2, and those not specified) 2577 | * Background Traffic (CS1) 2578 | * 2579 | * Total 4 traffic classes. 2580 | */ 2581 | 2582 | struct cake_sched_data *q = qdisc_priv(sch); 2583 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2584 | u64 rate = q->rate_bps; 2585 | u32 quantum = 1024; 2586 | 2587 | q->tin_cnt = 4; 2588 | 2589 | /* codepoint to class mapping */ 2590 | q->tin_index = diffserv4; 2591 | q->tin_order = bulk_order; 2592 | 2593 | /* class characteristics */ 2594 | cake_set_rate(&q->tins[0], rate, mtu, 2595 | us_to_ns(q->target), us_to_ns(q->interval)); 2596 | cake_set_rate(&q->tins[1], rate >> 4, mtu, 2597 | us_to_ns(q->target), us_to_ns(q->interval)); 2598 | cake_set_rate(&q->tins[2], rate >> 1, mtu, 2599 | us_to_ns(q->target), us_to_ns(q->interval)); 2600 | cake_set_rate(&q->tins[3], rate >> 2, mtu, 2601 | us_to_ns(q->target), us_to_ns(q->interval)); 2602 | 2603 | /* bandwidth-sharing weights */ 2604 | q->tins[0].tin_quantum = quantum; 2605 | q->tins[1].tin_quantum = quantum >> 4; 2606 | q->tins[2].tin_quantum = quantum >> 1; 2607 | q->tins[3].tin_quantum = quantum >> 2; 2608 | 2609 | return 0; 2610 | } 2611 | 2612 | static int cake_config_diffserv3(struct Qdisc *sch) 2613 | { 2614 | /* Simplified Diffserv structure with 3 tins. 
2615 | * Low Priority (CS1) 2616 | * Best Effort 2617 | * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 2618 | */ 2619 | struct cake_sched_data *q = qdisc_priv(sch); 2620 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2621 | u64 rate = q->rate_bps; 2622 | u32 quantum = 1024; 2623 | 2624 | q->tin_cnt = 3; 2625 | 2626 | /* codepoint to class mapping */ 2627 | q->tin_index = diffserv3; 2628 | q->tin_order = bulk_order; 2629 | 2630 | /* class characteristics */ 2631 | cake_set_rate(&q->tins[0], rate, mtu, 2632 | us_to_ns(q->target), us_to_ns(q->interval)); 2633 | cake_set_rate(&q->tins[1], rate >> 4, mtu, 2634 | us_to_ns(q->target), us_to_ns(q->interval)); 2635 | cake_set_rate(&q->tins[2], rate >> 2, mtu, 2636 | us_to_ns(q->target), us_to_ns(q->interval)); 2637 | 2638 | /* bandwidth-sharing weights */ 2639 | q->tins[0].tin_quantum = quantum; 2640 | q->tins[1].tin_quantum = quantum >> 4; 2641 | q->tins[2].tin_quantum = quantum >> 2; 2642 | 2643 | return 0; 2644 | } 2645 | 2646 | static void cake_reconfigure(struct Qdisc *sch) 2647 | { 2648 | struct cake_sched_data *q = qdisc_priv(sch); 2649 | int c, ft; 2650 | 2651 | switch (q->tin_mode) { 2652 | case CAKE_DIFFSERV_BESTEFFORT: 2653 | ft = cake_config_besteffort(sch); 2654 | break; 2655 | 2656 | case CAKE_DIFFSERV_PRECEDENCE: 2657 | ft = cake_config_precedence(sch); 2658 | break; 2659 | 2660 | case CAKE_DIFFSERV_DIFFSERV8: 2661 | ft = cake_config_diffserv8(sch); 2662 | break; 2663 | 2664 | case CAKE_DIFFSERV_DIFFSERV4: 2665 | ft = cake_config_diffserv4(sch); 2666 | break; 2667 | 2668 | case CAKE_DIFFSERV_DIFFSERV3: 2669 | default: 2670 | ft = cake_config_diffserv3(sch); 2671 | break; 2672 | } 2673 | 2674 | for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2675 | cake_clear_tin(sch, c); 2676 | q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2677 | } 2678 | 2679 | q->rate_ns = q->tins[ft].tin_rate_ns; 2680 | q->rate_shft = q->tins[ft].tin_rate_shft; 2681 | 2682 | if (q->buffer_config_limit) { 2683 | q->buffer_limit = q->buffer_config_limit; 2684 | } else if (q->rate_bps) { 2685 | u64 t = q->rate_bps * q->interval; 2686 | 2687 | do_div(t, USEC_PER_SEC / 4); 2688 | q->buffer_limit = max_t(u32, t, 4U << 20); 2689 | } else { 2690 | q->buffer_limit = ~0; 2691 | } 2692 | 2693 | sch->flags &= ~TCQ_F_CAN_BYPASS; 2694 | 2695 | q->buffer_limit = min(q->buffer_limit, 2696 | max(sch->limit * psched_mtu(qdisc_dev(sch)), 2697 | q->buffer_config_limit)); 2698 | } 2699 | 2700 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2701 | static int cake_change(struct Qdisc *sch, struct nlattr *opt) 2702 | #else 2703 | static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2704 | struct netlink_ext_ack *extack) 2705 | #endif 2706 | { 2707 | struct cake_sched_data *q = qdisc_priv(sch); 2708 | struct nlattr *tb[TCA_CAKE_MAX + 1]; 2709 | int err; 2710 | 2711 | if (!opt) 2712 | return -EINVAL; 2713 | 2714 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) 2715 | err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy); 2716 | #elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2717 | err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, NULL); 2718 | #else 2719 | err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); 2720 | #endif 2721 | if (err < 0) 2722 | return err; 2723 | 2724 | if (tb[TCA_CAKE_NAT]) { 2725 | #if IS_REACHABLE(CONFIG_NF_CONNTRACK) 2726 | q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2727 | q->flow_mode |= CAKE_FLOW_NAT_FLAG * 2728 | !!nla_get_u32(tb[TCA_CAKE_NAT]); 2729 | #else 2730 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) 
2731 | NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2732 | "No conntrack support in kernel"); 2733 | #endif 2734 | return -EOPNOTSUPP; 2735 | #endif 2736 | } 2737 | 2738 | if (tb[TCA_CAKE_BASE_RATE64]) 2739 | q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); 2740 | 2741 | if (tb[TCA_CAKE_DIFFSERV_MODE]) 2742 | q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); 2743 | 2744 | if (tb[TCA_CAKE_WASH]) { 2745 | if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2746 | q->rate_flags |= CAKE_FLAG_WASH; 2747 | else 2748 | q->rate_flags &= ~CAKE_FLAG_WASH; 2749 | } 2750 | 2751 | if (tb[TCA_CAKE_FLOW_MODE]) 2752 | q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | 2753 | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2754 | CAKE_FLOW_MASK)); 2755 | 2756 | if (tb[TCA_CAKE_ATM]) 2757 | q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); 2758 | 2759 | if (tb[TCA_CAKE_OVERHEAD]) { 2760 | q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); 2761 | q->rate_flags |= CAKE_FLAG_OVERHEAD; 2762 | 2763 | q->max_netlen = 0; 2764 | q->max_adjlen = 0; 2765 | q->min_netlen = ~0; 2766 | q->min_adjlen = ~0; 2767 | } 2768 | 2769 | if (tb[TCA_CAKE_RAW]) { 2770 | q->rate_flags &= ~CAKE_FLAG_OVERHEAD; 2771 | 2772 | q->max_netlen = 0; 2773 | q->max_adjlen = 0; 2774 | q->min_netlen = ~0; 2775 | q->min_adjlen = ~0; 2776 | } 2777 | 2778 | if (tb[TCA_CAKE_MPU]) 2779 | q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); 2780 | 2781 | if (tb[TCA_CAKE_RTT]) { 2782 | q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2783 | 2784 | if (!q->interval) 2785 | q->interval = 1; 2786 | } 2787 | 2788 | if (tb[TCA_CAKE_TARGET]) { 2789 | q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2790 | 2791 | if (!q->target) 2792 | q->target = 1; 2793 | } 2794 | 2795 | if (tb[TCA_CAKE_AUTORATE]) { 2796 | if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2797 | q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2798 | else 2799 | q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2800 | } 2801 | 2802 | if (tb[TCA_CAKE_INGRESS]) { 2803 | if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2804 | q->rate_flags |= CAKE_FLAG_INGRESS; 2805 | else 2806 | q->rate_flags &= ~CAKE_FLAG_INGRESS; 2807 | } 2808 | 2809 | if (tb[TCA_CAKE_ACK_FILTER]) 2810 | q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); 2811 | 2812 | if (tb[TCA_CAKE_MEMORY]) 2813 | q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); 2814 | 2815 | if (tb[TCA_CAKE_SPLIT_GSO]) { 2816 | if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2817 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2818 | else 2819 | q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2820 | } 2821 | 2822 | if (tb[TCA_CAKE_FWMARK]) { 2823 | q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); 2824 | q->fwmark_shft = q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0; 2825 | } 2826 | 2827 | if (q->tins) { 2828 | sch_tree_lock(sch); 2829 | cake_reconfigure(sch); 2830 | sch_tree_unlock(sch); 2831 | } 2832 | 2833 | return 0; 2834 | } 2835 | 2836 | static void cake_destroy(struct Qdisc *sch) 2837 | { 2838 | struct cake_sched_data *q = qdisc_priv(sch); 2839 | 2840 | qdisc_watchdog_cancel(&q->watchdog); 2841 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 2842 | tcf_destroy_chain(&q->filter_list); 2843 | #else 2844 | tcf_block_put(q->block); 2845 | #endif 2846 | kvfree(q->tins); 2847 | } 2848 | 2849 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2850 | static int cake_init(struct Qdisc *sch, struct nlattr *opt) 2851 | #else 2852 | static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2853 | struct netlink_ext_ack *extack) 2854 | #endif 2855 | { 2856 | struct cake_sched_data *q = qdisc_priv(sch); 2857 | int i, j, err; 2858 | 2859 | sch->limit = 10240; 2860 | q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2861 | q->flow_mode = CAKE_FLOW_TRIPLE; 2862 | 2863 | q->rate_bps = 0; /* unlimited by default */ 2864 | 2865 | q->interval = 100000; /* 100ms default */ 2866 | q->target = 5000; /* 5ms: codel RFC argues 2867 | * for 5 to 10% of interval 2868 | */ 2869 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2870 | q->cur_tin = 0; 2871 | q->cur_flow = 0; 2872 | 2873 | qdisc_watchdog_init(&q->watchdog, sch); 2874 | 2875 | if (opt) { 2876 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2877 | err = cake_change(sch, opt); 2878 | #else 2879 | err = cake_change(sch, opt, extack); 2880 | #endif 2881 | 2882 | if (err) 2883 | return err; 2884 | } 2885 | 2886 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) 2887 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) 2888 | err = tcf_block_get(&q->block, &q->filter_list); 2889 | #elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2890 | err = tcf_block_get(&q->block, &q->filter_list, sch); 2891 | #else 2892 | err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2893 | #endif 2894 | if (err) 2895 | return err; 2896 | #endif 2897 | 2898 | quantum_div[0] = ~0; 2899 | for (i = 1; i <= CAKE_QUEUES; i++) 2900 | quantum_div[i] = 65535 / i; 2901 | 2902 | q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), 2903 | GFP_KERNEL); 2904 | if (!q->tins) 2905 | goto nomem; 2906 | 2907 | for (i = 0; i < CAKE_MAX_TINS; i++) { 2908 | struct cake_tin_data *b = q->tins + i; 2909 | 2910 | b->perturb = prandom_u32(); 2911 | INIT_LIST_HEAD(&b->new_flows); 2912 | INIT_LIST_HEAD(&b->old_flows); 2913 | INIT_LIST_HEAD(&b->decaying_flows); 2914 | b->sparse_flow_count = 0; 2915 | b->bulk_flow_count = 0; 2916 | b->decaying_flow_count = 0; 2917 | 2918 | for (j = 0; j < CAKE_QUEUES; j++) { 2919 | struct cake_flow *flow = b->flows + j; 2920 | u32 k = j * CAKE_MAX_TINS + i; 2921 | 2922 | INIT_LIST_HEAD(&flow->flowchain); 2923 | cobalt_vars_init(&flow->cvars); 2924 | 2925 | q->overflow_heap[k].t = i; 2926 | q->overflow_heap[k].b = j; 2927 | b->overflow_idx[j] = k; 2928 | } 2929 | } 2930 | 2931 | cake_reconfigure(sch); 2932 | q->avg_peak_bandwidth = q->rate_bps; 2933 | q->min_netlen = ~0; 2934 | q->min_adjlen = ~0; 2935 | return 0; 2936 | 2937 | nomem: 2938 | cake_destroy(sch); 2939 | return -ENOMEM; 2940 | } 2941 | 2942 | static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2943 | { 2944 | struct cake_sched_data *q = qdisc_priv(sch); 2945 | struct nlattr *opts; 2946 | 2947 | opts = nla_nest_start(skb, TCA_OPTIONS); 2948 | if (!opts) 2949 | goto nla_put_failure; 2950 | 2951 | if (nla_put_u64_64bit(skb, 
TCA_CAKE_BASE_RATE64, q->rate_bps, 2952 | TCA_CAKE_PAD)) 2953 | goto nla_put_failure; 2954 | 2955 | if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, 2956 | q->flow_mode & CAKE_FLOW_MASK)) 2957 | goto nla_put_failure; 2958 | 2959 | if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) 2960 | goto nla_put_failure; 2961 | 2962 | if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) 2963 | goto nla_put_failure; 2964 | 2965 | if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) 2966 | goto nla_put_failure; 2967 | 2968 | if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2969 | !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2970 | goto nla_put_failure; 2971 | 2972 | if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2973 | !!(q->rate_flags & CAKE_FLAG_INGRESS))) 2974 | goto nla_put_failure; 2975 | 2976 | if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) 2977 | goto nla_put_failure; 2978 | 2979 | if (nla_put_u32(skb, TCA_CAKE_NAT, 2980 | !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) 2981 | goto nla_put_failure; 2982 | 2983 | if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) 2984 | goto nla_put_failure; 2985 | 2986 | if (nla_put_u32(skb, TCA_CAKE_WASH, 2987 | !!(q->rate_flags & CAKE_FLAG_WASH))) 2988 | goto nla_put_failure; 2989 | 2990 | if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) 2991 | goto nla_put_failure; 2992 | 2993 | if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) 2994 | if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2995 | goto nla_put_failure; 2996 | 2997 | if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) 2998 | goto nla_put_failure; 2999 | 3000 | if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) 3001 | goto nla_put_failure; 3002 | 3003 | if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 3004 | !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) 3005 | goto nla_put_failure; 3006 | 3007 | if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) 3008 | goto nla_put_failure; 3009 | 3010 | return nla_nest_end(skb, opts); 3011 | 3012 | nla_put_failure: 3013 | return -1; 3014 | } 3015 | 3016 | static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 3017 | { 3018 | struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); 3019 | struct cake_sched_data *q = qdisc_priv(sch); 3020 | struct nlattr *tstats, *ts; 3021 | int i; 3022 | 3023 | if (!stats) 3024 | return -1; 3025 | 3026 | #define PUT_STAT_U32(attr, data) do { \ 3027 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3028 | goto nla_put_failure; \ 3029 | } while (0) 3030 | #define PUT_STAT_U64(attr, data) do { \ 3031 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 3032 | data, TCA_CAKE_STATS_PAD)) \ 3033 | goto nla_put_failure; \ 3034 | } while (0) 3035 | 3036 | PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 3037 | PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 3038 | PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 3039 | PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 3040 | PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 3041 | PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 3042 | PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 3043 | PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 3044 | 3045 | #undef PUT_STAT_U32 3046 | #undef PUT_STAT_U64 3047 | 3048 | tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); 3049 | if (!tstats) 3050 | goto nla_put_failure; 3051 | 3052 | #define PUT_TSTAT_U32(attr, data) do { \ 3053 | if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 3054 | goto nla_put_failure; \ 3055 | } while (0) 3056 | #define PUT_TSTAT_U64(attr, data) do { \ 3057 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 
3058 | data, TCA_CAKE_TIN_STATS_PAD)) \ 3059 | goto nla_put_failure; \ 3060 | } while (0) 3061 | 3062 | for (i = 0; i < q->tin_cnt; i++) { 3063 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3064 | 3065 | ts = nla_nest_start(d->skb, i + 1); 3066 | if (!ts) 3067 | goto nla_put_failure; 3068 | 3069 | PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 3070 | PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 3071 | PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 3072 | 3073 | PUT_TSTAT_U32(TARGET_US, 3074 | ktime_to_us(ns_to_ktime(b->cparams.target))); 3075 | PUT_TSTAT_U32(INTERVAL_US, 3076 | ktime_to_us(ns_to_ktime(b->cparams.interval))); 3077 | 3078 | PUT_TSTAT_U32(SENT_PACKETS, b->packets); 3079 | PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 3080 | PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 3081 | PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 3082 | 3083 | PUT_TSTAT_U32(PEAK_DELAY_US, 3084 | ktime_to_us(ns_to_ktime(b->peak_delay))); 3085 | PUT_TSTAT_U32(AVG_DELAY_US, 3086 | ktime_to_us(ns_to_ktime(b->avge_delay))); 3087 | PUT_TSTAT_U32(BASE_DELAY_US, 3088 | ktime_to_us(ns_to_ktime(b->base_delay))); 3089 | 3090 | PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 3091 | PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 3092 | PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 3093 | 3094 | PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 3095 | b->decaying_flow_count); 3096 | PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 3097 | PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 3098 | PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 3099 | 3100 | PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 3101 | nla_nest_end(d->skb, ts); 3102 | } 3103 | 3104 | #undef PUT_TSTAT_U32 3105 | #undef PUT_TSTAT_U64 3106 | 3107 | nla_nest_end(d->skb, tstats); 3108 | return nla_nest_end(d->skb, stats); 3109 | 3110 | nla_put_failure: 3111 | nla_nest_cancel(d->skb, stats); 3112 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 3113 | sch_tree_unlock(sch); 3114 | #endif 3115 | return -1; 3116 | } 3117 | 3118 | static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 3119 | { 3120 | return NULL; 3121 | } 3122 | 3123 | static unsigned long cake_find(struct Qdisc *sch, u32 classid) 3124 | { 3125 | return 0; 3126 | } 3127 | 3128 | static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 3129 | u32 classid) 3130 | { 3131 | return 0; 3132 | } 3133 | 3134 | static void cake_unbind(struct Qdisc *q, unsigned long cl) 3135 | { 3136 | } 3137 | 3138 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 3139 | static struct tcf_proto __rcu **cake_find_tcf(struct Qdisc *sch, unsigned long cl) 3140 | #else 3141 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 3142 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl) 3143 | #else 3144 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 3145 | struct netlink_ext_ack *extack) 3146 | #endif 3147 | #endif 3148 | { 3149 | struct cake_sched_data *q = qdisc_priv(sch); 3150 | 3151 | if (cl) 3152 | return NULL; 3153 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 3154 | return &q->filter_list; 3155 | #else 3156 | return q->block; 3157 | #endif 3158 | } 3159 | 3160 | static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 3161 | struct sk_buff *skb, struct tcmsg *tcm) 3162 | { 3163 | tcm->tcm_handle |= TC_H_MIN(cl); 3164 | return 0; 3165 | } 3166 | 3167 | static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 3168 | struct gnet_dump *d) 3169 | { 3170 | struct cake_sched_data *q = 
qdisc_priv(sch); 3171 | const struct cake_flow *flow = NULL; 3172 | struct gnet_stats_queue qs = { 0 }; 3173 | struct nlattr *stats; 3174 | u32 idx = cl - 1; 3175 | 3176 | if (idx < CAKE_QUEUES * q->tin_cnt) { 3177 | const struct cake_tin_data *b = \ 3178 | &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3179 | const struct sk_buff *skb; 3180 | 3181 | flow = &b->flows[idx % CAKE_QUEUES]; 3182 | 3183 | if (flow->head) { 3184 | cake_maybe_lock(sch); 3185 | skb = flow->head; 3186 | while (skb) { 3187 | qs.qlen++; 3188 | skb = skb->next; 3189 | } 3190 | cake_maybe_unlock(sch); 3191 | } 3192 | qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3193 | qs.drops = flow->dropped; 3194 | } 3195 | if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3196 | return -1; 3197 | if (flow) { 3198 | ktime_t now = ktime_get(); 3199 | 3200 | stats = nla_nest_start(d->skb, TCA_STATS_APP); 3201 | if (!stats) 3202 | return -1; 3203 | 3204 | #define PUT_STAT_U32(attr, data) do { \ 3205 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3206 | goto nla_put_failure; \ 3207 | } while (0) 3208 | #define PUT_STAT_S32(attr, data) do { \ 3209 | if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3210 | goto nla_put_failure; \ 3211 | } while (0) 3212 | 3213 | PUT_STAT_S32(DEFICIT, flow->deficit); 3214 | PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3215 | PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3216 | PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3217 | if (flow->cvars.p_drop) { 3218 | PUT_STAT_S32(BLUE_TIMER_US, 3219 | ktime_to_us( 3220 | ktime_sub(now, 3221 | flow->cvars.blue_timer))); 3222 | } 3223 | if (flow->cvars.dropping) { 3224 | PUT_STAT_S32(DROP_NEXT_US, 3225 | ktime_to_us( 3226 | ktime_sub(now, 3227 | flow->cvars.drop_next))); 3228 | } 3229 | 3230 | if (nla_nest_end(d->skb, stats) < 0) 3231 | return -1; 3232 | } 3233 | 3234 | return 0; 3235 | 3236 | nla_put_failure: 3237 | nla_nest_cancel(d->skb, stats); 3238 | return -1; 3239 | } 3240 | 3241 | static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3242 | { 3243 | struct cake_sched_data *q = qdisc_priv(sch); 3244 | unsigned int i, j; 3245 | 3246 | if (arg->stop) 3247 | return; 3248 | 3249 | for (i = 0; i < q->tin_cnt; i++) { 3250 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3251 | 3252 | for (j = 0; j < CAKE_QUEUES; j++) { 3253 | if (list_empty(&b->flows[j].flowchain) || 3254 | arg->count < arg->skip) { 3255 | arg->count++; 3256 | continue; 3257 | } 3258 | if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { 3259 | arg->stop = 1; 3260 | break; 3261 | } 3262 | arg->count++; 3263 | } 3264 | } 3265 | } 3266 | 3267 | static const struct Qdisc_class_ops cake_class_ops = { 3268 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 3269 | .tcf_chain = cake_find_tcf, 3270 | #else 3271 | .tcf_block = cake_tcf_block, 3272 | #endif 3273 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) 3274 | .get = cake_find, 3275 | .put = cake_unbind, 3276 | #else 3277 | .find = cake_find, 3278 | #endif 3279 | .unbind_tcf = cake_unbind, 3280 | .bind_tcf = cake_bind, 3281 | .leaf = cake_leaf, 3282 | .dump = cake_dump_class, 3283 | .dump_stats = cake_dump_class_stats, 3284 | .walk = cake_walk, 3285 | }; 3286 | 3287 | static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3288 | .cl_ops = &cake_class_ops, 3289 | .id = "cake", 3290 | .priv_size = sizeof(struct cake_sched_data), 3291 | .enqueue = cake_enqueue, 3292 | .dequeue = cake_dequeue, 3293 | .peek = qdisc_peek_dequeued, 3294 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 3295 | .drop = cake_drop, 
3296 | #endif 3297 | .init = cake_init, 3298 | .reset = cake_reset, 3299 | .destroy = cake_destroy, 3300 | .change = cake_change, 3301 | .dump = cake_dump, 3302 | .dump_stats = cake_dump_stats, 3303 | .owner = THIS_MODULE, 3304 | }; 3305 | 3306 | static int __init cake_module_init(void) 3307 | { 3308 | return register_qdisc(&cake_qdisc_ops); 3309 | } 3310 | 3311 | static void __exit cake_module_exit(void) 3312 | { 3313 | unregister_qdisc(&cake_qdisc_ops); 3314 | } 3315 | 3316 | module_init(cake_module_init) 3317 | module_exit(cake_module_exit) 3318 | MODULE_AUTHOR("Jonathan Morton"); 3319 | MODULE_LICENSE("Dual BSD/GPL"); 3320 | MODULE_DESCRIPTION("The CAKE shaper."); 3321 | --------------------------------------------------------------------------------
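A quick way to sanity-check the shaper arithmetic in the listing above is to run its two core calculations in userspace: the byte-rate to time-per-byte conversion from cake_set_rate() and the default buffer sizing from cake_reconfigure(). The sketch below is only an illustration, not part of the module; it assumes a 50 Mbit/s shaper rate expressed as 6,250,000 bytes/s (q->rate_bps is a byte rate, per the "convert byte-rate into time-per-byte" comment), a 1514-byte MTU, and the 100 ms default interval set in cake_init(), and it substitutes plain 64-bit division for the kernel's div64_u64()/do_div() helpers.

/*
 * Userspace sketch mirroring cake_set_rate() and the default memory
 * sizing in cake_reconfigure().  Variable names follow the kernel code;
 * the inputs (rate, mtu) are illustrative assumptions.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define USEC_PER_SEC 1000000ULL

int main(void)
{
        uint64_t rate = 6250000;        /* shaper rate in bytes/s (50 Mbit/s, assumed) */
        uint32_t interval = 100000;     /* q->interval in microseconds (100 ms default) */
        uint32_t mtu = 1514;            /* assumed psched_mtu() of an Ethernet device */
        uint64_t min_rate = 64;         /* MIN_RATE floor from cake_set_rate() */

        /* --- cake_set_rate(): ns-per-byte in fixed point --- */
        unsigned int rate_shft = 34;
        uint64_t rate_ns = NSEC_PER_SEC << rate_shft;

        rate_ns /= (rate > min_rate ? rate : min_rate);
        while (rate_ns >> 34) {         /* keep the mantissa below 2^34 */
                rate_ns >>= 1;
                rate_shft--;
        }

        /* serialisation time of one MTU at this rate (cparams.mtu_time) */
        uint64_t byte_target_ns = ((uint64_t)mtu * rate_ns) >> rate_shft;

        /* --- cake_reconfigure(): default buffer_limit when no memory override --- */
        uint64_t t = rate * interval;
        uint32_t buffer_limit;

        t /= USEC_PER_SEC / 4;          /* bytes sent in four intervals */
        buffer_limit = t > (4U << 20) ? (uint32_t)t : (4U << 20);

        printf("rate_ns=%llu rate_shft=%u mtu_time=%lluns buffer_limit=%u bytes\n",
               (unsigned long long)rate_ns, rate_shft,
               (unsigned long long)byte_target_ns, buffer_limit);
        return 0;
}

With these inputs the normalisation loop settles on rate_shft = 26, one MTU-sized packet costs roughly 242 us of serialisation time (which becomes cparams.mtu_time), and the four-interval buffer estimate of about 2.5 MB is raised to the 4 MiB floor before the sch->limit cap at the end of cake_reconfigure() is applied.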