├── .gitignore ├── Kbuild ├── Kconfig ├── Makefile ├── README.org ├── cobalt_compat.h ├── gen_cake_const.c ├── pkt_sched.h └── sch_cake.c /.gitignore: -------------------------------------------------------------------------------- 1 | .*.cmd 2 | .tmp_versions 3 | Module.symvers 4 | modules.order 5 | sch_cake.ko 6 | sch_cake.mod.c 7 | *.o 8 | *.dwo 9 | -------------------------------------------------------------------------------- /Kbuild: -------------------------------------------------------------------------------- 1 | ifneq ($(KBUILD_EXTMOD),) 2 | CONFIG_NET_SCH_CAKE := m 3 | endif 4 | 5 | obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o 6 | -------------------------------------------------------------------------------- /Kconfig: -------------------------------------------------------------------------------- 1 | config NET_SCH_CAKE 2 | tristate "Common Applications Kept Enhanced (CAKE)" 3 | depends on NET_SCHED 4 | help 5 | Say Y here if you want to use the Common Applications Kept Enhanced 6 | (CAKE) queue management algorithm. 7 | 8 | To compile this driver as a module, choose M here: the module 9 | will be called sch_cake. 10 | 11 | If unsure, say N. 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | KERNEL_VERSION := $(shell uname -r) 2 | IDIR := /lib/modules/$(KERNEL_VERSION)/kernel/net/sched/ 3 | KDIR := /lib/modules/$(KERNEL_VERSION)/build 4 | PWD := $(shell pwd) 5 | VERSION := $(shell git rev-parse HEAD 2>/dev/null) 6 | default: 7 | @$(MAKE) -C $(KDIR) M=$(PWD) modules $(if $(VERSION),LDFLAGS_MODULE="--build-id=0x$(VERSION)" CFLAGS_MODULE="-DCAKE_VERSION=\\\"$(VERSION)\\\"") 8 | 9 | install: 10 | install -v -m 644 sch_cake.ko $(IDIR) 11 | depmod "$(KERNEL_VERSION)" 12 | [ "$(KERNEL_VERSION)" != `uname -r` ] || modprobe sch_cake 13 | 14 | clean: 15 | @$(MAKE) -C $(KDIR) M=$(PWD) clean 16 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+OPTIONS: ^:nil 2 | 3 | * Common Applications Kept Enhanced (CAKE) scheduler 4 | 5 | This is the out-of-tree version of [[https://www.bufferbloat.net/projects/codel/wiki/Cake/][CAKE]], the Linux qdisc that combines scheduler 6 | and traffic shaper for effective bufferbloat mitigation. 7 | 8 | Note that sch_cake is part of the upstream Linux kernel since kernel version 9 | 4.19, so this repository exists only as a resource for building the qdisc with 10 | older versions of the kernel. If you're already on kernel 4.19 or newer, you can 11 | just load CAKE with =tc= and the kernel shipped by your distribution! 12 | -------------------------------------------------------------------------------- /cobalt_compat.h: -------------------------------------------------------------------------------- 1 | #ifndef __NET_SCHED_COBALT_COMPAT_H 2 | #define __NET_SCHED_COBALT_COMPAT_H 3 | /* Backport some stuff if needed. 
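 *
 * Each block below is guarded by a LINUX_VERSION_CODE check, so only the
 * shims the running kernel actually lacks get compiled in. For example, on
 * a 3.10 kernel the test KERNEL_VERSION(3, 11, 0) > LINUX_VERSION_CODE is
 * true and the ktime_add_ms() fallback just below is defined; on 3.11 and
 * later the kernel is assumed to provide its own helper.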
4 | */ 5 | #if KERNEL_VERSION(3, 11, 0) > LINUX_VERSION_CODE 6 | #define ktime_add_ms(kt, msec) ktime_add_ns(kt, msec * NSEC_PER_MSEC) 7 | #endif 8 | 9 | #if KERNEL_VERSION(3, 14, 0) > LINUX_VERSION_CODE 10 | 11 | static inline u32 reciprocal_scale(u32 val, u32 ep_ro) 12 | { 13 | return (u32)(((u64) val * ep_ro) >> 32); 14 | } 15 | 16 | #endif 17 | 18 | #if KERNEL_VERSION(3, 15, 0) > LINUX_VERSION_CODE 19 | 20 | static inline void kvfree(const void *addr) 21 | { 22 | if (is_vmalloc_addr(addr)) 23 | vfree(addr); 24 | else 25 | kfree(addr); 26 | } 27 | 28 | #endif 29 | 30 | #if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE 31 | #define ktime_after(cmp1, cmp2) ktime_compare(cmp1, cmp2) > 0 32 | #define ktime_before(cmp1, cmp2) ktime_compare(cmp1, cmp2) < 0 33 | #endif 34 | 35 | #if KERNEL_VERSION(3, 17, 0) > LINUX_VERSION_CODE 36 | 37 | #define ktime_get_ns() ktime_to_ns(ktime_get()) 38 | 39 | #endif 40 | 41 | /* 3.18 > 4.7 use 3 arg, everything else uses 2 arg versions 42 | * of qdisc_watchdog_schedule_ns 43 | */ 44 | #if ((KERNEL_VERSION(3, 18, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 8, 0) > LINUX_VERSION_CODE)) 45 | #define qdisc_watchdog_schedule_ns(_a, _b) qdisc_watchdog_schedule_ns(_a, _b, true); 46 | #endif 47 | 48 | #if KERNEL_VERSION(3, 18, 0) > LINUX_VERSION_CODE 49 | static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, 50 | const struct sk_buff *skb) 51 | { 52 | sch->qstats.backlog -= qdisc_pkt_len(skb); 53 | } 54 | 55 | static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, 56 | const struct sk_buff *skb) 57 | { 58 | sch->qstats.backlog += qdisc_pkt_len(skb); 59 | } 60 | 61 | static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) 62 | { 63 | sch->qstats.drops += count; 64 | } 65 | 66 | static inline void qdisc_qstats_drop(struct Qdisc *sch) 67 | { 68 | sch->qstats.drops++; 69 | } 70 | 71 | #define gnet_stats_copy_queue(_a, _b, _c, _d) gnet_stats_copy_queue(_a, _c) 72 | 73 | #endif 74 | 75 | #if KERNEL_VERSION(4, 1, 0) > LINUX_VERSION_CODE 76 | #define TCPOPT_FASTOPEN 34 77 | #endif 78 | 79 | #if KERNEL_VERSION(4, 3, 0) > LINUX_VERSION_CODE 80 | #define tcf_classify(_a, _b, _c, _d) tc_classify(_a, _b, _c); 81 | #elif KERNEL_VERSION(4, 13, 0) > LINUX_VERSION_CODE 82 | #define tcf_classify(_a, _b, _c, _d) tc_classify(_a, _b, _c, _d); 83 | #endif 84 | 85 | #if !defined(IS_REACHABLE) 86 | #define IS_REACHABLE(option) (config_enabled(option) || \ 87 | (config_enabled(option##_MODULE) && config_enabled(MODULE))) 88 | #endif 89 | 90 | #if ((KERNEL_VERSION(4, 4, 114) > LINUX_VERSION_CODE) && \ 91 | ((KERNEL_VERSION(4, 1, 50) > LINUX_VERSION_CODE) || (KERNEL_VERSION(4, 2, 0) <= LINUX_VERSION_CODE))) 92 | static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) 93 | { 94 | return th->doff * 4; 95 | } 96 | #endif 97 | 98 | #if KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE 99 | #define IP6_ECN_set_ce(_a, _b) IP6_ECN_set_ce(_b) 100 | #endif 101 | 102 | #if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE 103 | static inline int skb_try_make_writable(struct sk_buff *skb, 104 | unsigned int write_len) 105 | { 106 | return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && 107 | pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 108 | } 109 | #endif 110 | 111 | #if KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE 112 | static inline int skb_mac_offset(const struct sk_buff *skb) 113 | { 114 | return skb_mac_header(skb) - skb->data; 115 | } 116 | #endif 117 | 118 | #if KERNEL_VERSION(4, 7, 0) > LINUX_VERSION_CODE 119 | #define nla_put_u64_64bit(skb, attrtype, 
value, padattr) nla_put_u64(skb, attrtype, value) 120 | #endif 121 | 122 | #if KERNEL_VERSION(4, 8, 0) > LINUX_VERSION_CODE 123 | #define cake_maybe_lock(sch) 124 | #define cake_maybe_unlock(sch) 125 | #else 126 | #define cake_maybe_lock(sch) sch_tree_lock(sch); 127 | #define cake_maybe_unlock(sch) sch_tree_unlock(sch); 128 | #endif 129 | 130 | 131 | #if KERNEL_VERSION(4, 12, 0) > LINUX_VERSION_CODE 132 | static void *kvzalloc(size_t sz, gfp_t flags) 133 | { 134 | void *ptr = kzalloc(sz, flags); 135 | 136 | if (!ptr) 137 | ptr = vzalloc(sz); 138 | return ptr; 139 | } 140 | #endif 141 | 142 | /* save the best till last 143 | * qdisc_tree_reduce_backlog appears in kernel from: 144 | 3.16.37 onward 145 | not in 3.17 146 | 3.18.37 147 | not in 3.19 148 | not in 4.0 149 | 4.1.28 onward 150 | not in 4.2 151 | not in 4.3 152 | 4.4.11 onward 153 | 4.5.5 onward 154 | */ 155 | #if ((KERNEL_VERSION(3, 0, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(3, 16, 37) > LINUX_VERSION_CODE)) || \ 156 | ((KERNEL_VERSION(3, 18, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(3, 18, 37) > LINUX_VERSION_CODE)) || \ 157 | ((KERNEL_VERSION(4, 1, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 1, 28) > LINUX_VERSION_CODE)) || \ 158 | ((KERNEL_VERSION(4, 4, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 4, 11) > LINUX_VERSION_CODE)) || \ 159 | ((KERNEL_VERSION(4, 5, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 5, 5) > LINUX_VERSION_CODE)) 160 | #define qdisc_tree_reduce_backlog(_a, _b, _c) qdisc_tree_decrease_qlen(_a, _b) 161 | #endif 162 | 163 | 164 | #endif 165 | -------------------------------------------------------------------------------- /gen_cake_const.c: -------------------------------------------------------------------------------- 1 | /** 2 | * cake_const.c 3 | * No point in calculating the diffserv lookup tables at runtime 4 | * Dave Taht 5 | * 2015-12-21 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | /* List of known Diffserv codepoints: 14 | * 15 | * Least Effort (CS1) 16 | * Best Effort (CS0) 17 | * Max Reliability (TOS1) 18 | * Max Throughput (TOS2) 19 | * Min Delay (TOS4) 20 | * Assured Forwarding 1 (AF1x) - x3 21 | * Assured Forwarding 2 (AF2x) - x3 22 | * Assured Forwarding 3 (AF3x) - x3 23 | * Assured Forwarding 4 (AF4x) - x3 24 | * Precedence Class 2 (CS2) 25 | * Precedence Class 3 (CS3) 26 | * Precedence Class 4 (CS4) 27 | * Precedence Class 5 (CS5) 28 | * Precedence Class 6 (CS6) 29 | * Precedence Class 7 (CS7) 30 | * Voice Admit (VA) 31 | * Expedited Forwarding (EF) 32 | 33 | * Total 25 codepoints. 34 | */ 35 | 36 | /* List of traffic classes in RFC 4594: 37 | * (roughly descending order of contended priority) 38 | * (roughly ascending order of uncontended throughput) 39 | * 40 | * Network Control (CS6,CS7) - routing traffic 41 | * Telephony (EF,VA) - aka. VoIP streams 42 | * Signalling (CS5) - VoIP setup 43 | * Multimedia Conferencing (AF4x) - aka. video calls 44 | * Realtime Interactive (CS4) - eg. games 45 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 46 | * Broadcast Video (CS3) 47 | * Low Latency Data (AF2x,TOS4) - eg. database 48 | * Ops, Admin, Management (CS2,TOS1) - eg. ssh 49 | * Standard Service (CS0 & unrecognised codepoints) 50 | * High Throughput Data (AF1x,TOS2) - eg. web traffic 51 | * Low Priority Data (CS1) - eg. BitTorrent 52 | 53 | * Total 12 traffic classes. 54 | */ 55 | 56 | static int min(int a, int b) { 57 | return (a < b ? 
a : b); 58 | } 59 | 60 | static void print_dscp(char *var, uint8_t *dscp) { 61 | printf("static const u8 %s[] = {", var); 62 | for(int i=0;i<64;i+=8) { 63 | for(int j=0; j<7; j++) { 64 | printf("%d, ",(int)dscp[i+j]); 65 | } 66 | printf("%d,\n\t\t\t\t", dscp[i+7]); 67 | } 68 | printf("};\n"); 69 | } 70 | 71 | void precedence() { 72 | uint8_t dscp[64]; 73 | for (int i = 0; i < 64; i++) 74 | dscp[i]= min((i >> 3), 8); 75 | print_dscp("precedence",dscp); 76 | } 77 | 78 | /* Pruned list of traffic classes for typical applications: 79 | * 80 | * Network Control (CS6, CS7) 81 | * Minimum Latency (EF, VA, CS5, CS4) 82 | * Interactive Shell (CS2, TOS1) 83 | * Low Latency Transactions (AF2x, TOS4) 84 | * Video Streaming (AF4x, AF3x, CS3) 85 | * Bog Standard (CS0 etc.) 86 | * High Throughput (AF1x, TOS2) 87 | * Background Traffic (CS1) 88 | * 89 | * Total 8 traffic classes. 90 | */ 91 | 92 | void diffserv8() { 93 | uint8_t dscp[64]; 94 | 95 | /* codepoint to class mapping */ 96 | for (int i = 0; i < 64; i++) 97 | dscp[i] = 2; /* default to best-effort */ 98 | 99 | dscp[0x08] = 0; /* CS1 */ 100 | dscp[0x02] = 1; /* TOS2 */ 101 | dscp[0x18] = 3; /* CS3 */ 102 | dscp[0x04] = 4; /* TOS4 */ 103 | dscp[0x01] = 5; /* TOS1 */ 104 | dscp[0x10] = 5; /* CS2 */ 105 | dscp[0x20] = 6; /* CS4 */ 106 | dscp[0x28] = 6; /* CS5 */ 107 | dscp[0x2c] = 6; /* VA */ 108 | dscp[0x2e] = 6; /* EF */ 109 | dscp[0x30] = 7; /* CS6 */ 110 | dscp[0x38] = 7; /* CS7 */ 111 | 112 | for (int i = 2; i <= 6; i += 2) { 113 | dscp[0x08 + i] = 1; /* AF1x */ 114 | dscp[0x10 + i] = 4; /* AF2x */ 115 | dscp[0x18 + i] = 3; /* AF3x */ 116 | dscp[0x20 + i] = 3; /* AF4x */ 117 | } 118 | 119 | print_dscp("diffserv8",dscp); 120 | } 121 | 122 | /* Diffserv structure specialised for Latency-Loss-Tradeoff spec. 123 | * Loss Sensitive (TOS1, TOS2) 124 | * Best Effort 125 | * Latency Sensitive (TOS4, TOS5, VA, EF) 126 | * Low Priority (CS1) 127 | * Network Control (CS6, CS7) 128 | */ 129 | 130 | void diffserv_llt() { 131 | uint8_t dscp[64]; 132 | /* codepoint to class mapping */ 133 | 134 | for (int i = 0; i < 64; i++) 135 | dscp[i] = 1; /* default to best-effort */ 136 | 137 | dscp[0x01] = 0; /* TOS1 */ 138 | dscp[0x02] = 0; /* TOS2 */ 139 | dscp[0x04] = 2; /* TOS4 */ 140 | dscp[0x05] = 2; /* TOS5 */ 141 | dscp[0x2c] = 2; /* VA */ 142 | dscp[0x2e] = 2; /* EF */ 143 | dscp[0x08] = 3; /* CS1 */ 144 | dscp[0x30] = 4; /* CS6 */ 145 | dscp[0x38] = 4; /* CS7 */ 146 | 147 | print_dscp("diffserv_llt",dscp); 148 | 149 | } 150 | 151 | /* Further pruned list of traffic classes for four-class system: 152 | * 153 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 154 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 155 | * Best Effort (CS0, AF1x, TOS2, and those not specified) 156 | * Background Traffic (CS1) 157 | * 158 | * Total 4 traffic classes. 
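 *
 * Worked example of the table diffserv4() below generates: EF (0x2e) and
 * CS6 (0x30) land in class 3 (Latency Sensitive), AF41 (0x22) lands in
 * class 2 (Streaming Media), CS1 (0x08) lands in class 0 (Background
 * Traffic), and any codepoint not listed stays at the default of class 1
 * (Best Effort).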
159 | */ 160 | 161 | void diffserv4() { 162 | uint8_t dscp[64]; 163 | /* codepoint to class mapping */ 164 | for (int i = 0; i < 64; i++) 165 | dscp[i] = 1; /* default to best-effort */ 166 | 167 | dscp[0x08] = 0; /* CS1 */ 168 | 169 | dscp[0x18] = 2; /* CS3 */ 170 | dscp[0x04] = 2; /* TOS4 */ 171 | dscp[0x01] = 2; /* TOS1 */ 172 | dscp[0x10] = 2; /* CS2 */ 173 | 174 | dscp[0x20] = 3; /* CS4 */ 175 | dscp[0x28] = 3; /* CS5 */ 176 | dscp[0x2c] = 3; /* VA */ 177 | dscp[0x2e] = 3; /* EF */ 178 | dscp[0x30] = 3; /* CS6 */ 179 | dscp[0x38] = 3; /* CS7 */ 180 | 181 | for (int i = 2; i <= 6; i += 2) { 182 | dscp[0x10 + i] = 2; /* AF2x */ 183 | dscp[0x18 + i] = 2; /* AF3x */ 184 | dscp[0x20 + i] = 2; /* AF4x */ 185 | } 186 | 187 | print_dscp("diffserv4",dscp); 188 | } 189 | 190 | /* Simplified Diffserv structure with 3 tins. 191 | * Low Priority (CS1) 192 | * Best Effort 193 | * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 194 | */ 195 | 196 | void diffserv3() { 197 | uint8_t dscp[64]; 198 | /* codepoint to class mapping */ 199 | for (int i = 0; i < 64; i++) 200 | dscp[i] = 1; /* default to best-effort */ 201 | 202 | dscp[0x08] = 0; /* CS1 */ 203 | 204 | dscp[0x04] = 2; /* TOS4 */ 205 | dscp[0x2c] = 2; /* VA */ 206 | dscp[0x2e] = 2; /* EF */ 207 | dscp[0x30] = 2; /* CS6 */ 208 | dscp[0x38] = 2; /* CS7 */ 209 | 210 | print_dscp("diffserv3",dscp); 211 | } 212 | 213 | void besteffort() { 214 | uint8_t dscp[64]; 215 | for (int i = 0; i < 64; i++) 216 | dscp[i]=0; 217 | print_dscp("besteffort",dscp); 218 | } 219 | 220 | int main(int argc, char **argv) { 221 | precedence(); 222 | diffserv_llt(); 223 | diffserv8(); 224 | diffserv4(); 225 | diffserv3(); 226 | besteffort(); 227 | } 228 | -------------------------------------------------------------------------------- /pkt_sched.h: -------------------------------------------------------------------------------- 1 | #ifndef __LINUX_PKT_SCHED_H 2 | #define __LINUX_PKT_SCHED_H 3 | 4 | #include 5 | 6 | /* Logical priority bands not depending on specific packet scheduler. 7 | Every scheduler will map them to real traffic classes, if it has 8 | no more precise mechanism to classify packets. 9 | 10 | These numbers have no special meaning, though their coincidence 11 | with obsolete IPv6 values is not occasional :-). New IPv6 drafts 12 | preferred full anarchy inspired by diffserv group. 13 | 14 | Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy 15 | class, actually, as rule it will be handled with more care than 16 | filler or even bulk. 17 | */ 18 | 19 | #define TC_PRIO_BESTEFFORT 0 20 | #define TC_PRIO_FILLER 1 21 | #define TC_PRIO_BULK 2 22 | #define TC_PRIO_INTERACTIVE_BULK 4 23 | #define TC_PRIO_INTERACTIVE 6 24 | #define TC_PRIO_CONTROL 7 25 | 26 | #define TC_PRIO_MAX 15 27 | 28 | /* Generic queue statistics, available for all the elements. 29 | Particular schedulers may have also their private records. 
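   The otherwise uncommented qlen and backlog fields below are the
   instantaneous queue length in packets and in bytes, respectively.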
30 | */ 31 | 32 | struct tc_stats { 33 | __u64 bytes; /* Number of enqueued bytes */ 34 | __u32 packets; /* Number of enqueued packets */ 35 | __u32 drops; /* Packets dropped because of lack of resources */ 36 | __u32 overlimits; /* Number of throttle events when this 37 | * flow goes out of allocated bandwidth */ 38 | __u32 bps; /* Current flow byte rate */ 39 | __u32 pps; /* Current flow packet rate */ 40 | __u32 qlen; 41 | __u32 backlog; 42 | }; 43 | 44 | struct tc_estimator { 45 | signed char interval; 46 | unsigned char ewma_log; 47 | }; 48 | 49 | /* "Handles" 50 | --------- 51 | 52 | All the traffic control objects have 32bit identifiers, or "handles". 53 | 54 | They can be considered as opaque numbers from user API viewpoint, 55 | but actually they always consist of two fields: major and 56 | minor numbers, which are interpreted by kernel specially, 57 | that may be used by applications, though not recommended. 58 | 59 | F.e. qdisc handles always have minor number equal to zero, 60 | classes (or flows) have major equal to parent qdisc major, and 61 | minor uniquely identifying class inside qdisc. 62 | 63 | Macros to manipulate handles: 64 | */ 65 | 66 | 67 | #define TC_H_MAJ_MASK (0xFFFF0000U) 68 | #define TC_H_MIN_MASK (0x0000FFFFU) 69 | #define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) 70 | #define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) 71 | #define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) 72 | 73 | #define TC_H_UNSPEC (0U) 74 | #define TC_H_ROOT (0xFFFFFFFFU) 75 | #define TC_H_INGRESS (0xFFFFFFF1U) 76 | #ifndef TC_H_CLSACT 77 | #define TC_H_CLSACT TC_H_INGRESS 78 | #define TC_H_MIN_PRIORITY 0xFFF0U 79 | #define TC_H_MIN_INGRESS 0xFFF2U 80 | #define TC_H_MIN_EGRESS 0xFFF3U 81 | #endif 82 | 83 | /* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ 84 | enum tc_link_layer { 85 | TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ 86 | TC_LINKLAYER_ETHERNET, 87 | TC_LINKLAYER_ATM, 88 | }; 89 | #define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ 90 | 91 | struct tc_ratespec { 92 | unsigned char cell_log; 93 | __u8 linklayer; /* lower 4 bits */ 94 | unsigned short overhead; 95 | short cell_align; 96 | unsigned short mpu; 97 | __u32 rate; 98 | }; 99 | 100 | #define TC_RTAB_SIZE 1024 101 | 102 | struct tc_sizespec { 103 | unsigned char cell_log; 104 | unsigned char size_log; 105 | short cell_align; 106 | int overhead; 107 | unsigned int linklayer; 108 | unsigned int mpu; 109 | unsigned int mtu; 110 | unsigned int tsize; 111 | }; 112 | 113 | enum { 114 | TCA_STAB_UNSPEC, 115 | TCA_STAB_BASE, 116 | TCA_STAB_DATA, 117 | __TCA_STAB_MAX 118 | }; 119 | 120 | #define TCA_STAB_MAX (__TCA_STAB_MAX - 1) 121 | 122 | /* FIFO section */ 123 | 124 | struct tc_fifo_qopt { 125 | __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ 126 | }; 127 | 128 | /* PRIO section */ 129 | 130 | #define TCQ_PRIO_BANDS 16 131 | #define TCQ_MIN_PRIO_BANDS 2 132 | 133 | struct tc_prio_qopt { 134 | int bands; /* Number of bands */ 135 | __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ 136 | }; 137 | 138 | /* MULTIQ section */ 139 | 140 | struct tc_multiq_qopt { 141 | __u16 bands; /* Number of bands */ 142 | __u16 max_bands; /* Maximum number of queues */ 143 | }; 144 | 145 | /* PLUG section */ 146 | 147 | #define TCQ_PLUG_BUFFER 0 148 | #define TCQ_PLUG_RELEASE_ONE 1 149 | #define TCQ_PLUG_RELEASE_INDEFINITE 2 150 | #define TCQ_PLUG_LIMIT 3 151 | 152 | struct tc_plug_qopt { 153 | /* TCQ_PLUG_BUFFER: Inset a plug into the queue and 154 | * 
buffer any incoming packets 155 | * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head 156 | * to beginning of the next plug. 157 | * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. 158 | * Stop buffering packets until the next TCQ_PLUG_BUFFER 159 | * command is received (just act as a pass-thru queue). 160 | * TCQ_PLUG_LIMIT: Increase/decrease queue size 161 | */ 162 | int action; 163 | __u32 limit; 164 | }; 165 | 166 | /* TBF section */ 167 | 168 | struct tc_tbf_qopt { 169 | struct tc_ratespec rate; 170 | struct tc_ratespec peakrate; 171 | __u32 limit; 172 | __u32 buffer; 173 | __u32 mtu; 174 | }; 175 | 176 | enum { 177 | TCA_TBF_UNSPEC, 178 | TCA_TBF_PARMS, 179 | TCA_TBF_RTAB, 180 | TCA_TBF_PTAB, 181 | TCA_TBF_RATE64, 182 | TCA_TBF_PRATE64, 183 | TCA_TBF_BURST, 184 | TCA_TBF_PBURST, 185 | __TCA_TBF_MAX, 186 | }; 187 | 188 | #define TCA_TBF_MAX (__TCA_TBF_MAX - 1) 189 | 190 | 191 | /* TEQL section */ 192 | 193 | /* TEQL does not require any parameters */ 194 | 195 | /* SFQ section */ 196 | 197 | struct tc_sfq_qopt { 198 | unsigned quantum; /* Bytes per round allocated to flow */ 199 | int perturb_period; /* Period of hash perturbation */ 200 | __u32 limit; /* Maximal packets in queue */ 201 | unsigned divisor; /* Hash divisor */ 202 | unsigned flows; /* Maximal number of flows */ 203 | }; 204 | 205 | struct tc_sfqred_stats { 206 | __u32 prob_drop; /* Early drops, below max threshold */ 207 | __u32 forced_drop; /* Early drops, after max threshold */ 208 | __u32 prob_mark; /* Marked packets, below max threshold */ 209 | __u32 forced_mark; /* Marked packets, after max threshold */ 210 | __u32 prob_mark_head; /* Marked packets, below max threshold */ 211 | __u32 forced_mark_head;/* Marked packets, after max threshold */ 212 | }; 213 | 214 | struct tc_sfq_qopt_v1 { 215 | struct tc_sfq_qopt v0; 216 | unsigned int depth; /* max number of packets per flow */ 217 | unsigned int headdrop; 218 | /* SFQRED parameters */ 219 | __u32 limit; /* HARD maximal flow queue length (bytes) */ 220 | __u32 qth_min; /* Min average length threshold (bytes) */ 221 | __u32 qth_max; /* Max average length threshold (bytes) */ 222 | unsigned char Wlog; /* log(W) */ 223 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 224 | unsigned char Scell_log; /* cell size for idle damping */ 225 | unsigned char flags; 226 | __u32 max_P; /* probability, high resolution */ 227 | /* SFQRED stats */ 228 | struct tc_sfqred_stats stats; 229 | }; 230 | 231 | 232 | struct tc_sfq_xstats { 233 | __s32 allot; 234 | }; 235 | 236 | /* RED section */ 237 | 238 | enum { 239 | TCA_RED_UNSPEC, 240 | TCA_RED_PARMS, 241 | TCA_RED_STAB, 242 | TCA_RED_MAX_P, 243 | __TCA_RED_MAX, 244 | }; 245 | 246 | #define TCA_RED_MAX (__TCA_RED_MAX - 1) 247 | 248 | struct tc_red_qopt { 249 | __u32 limit; /* HARD maximal queue length (bytes) */ 250 | __u32 qth_min; /* Min average length threshold (bytes) */ 251 | __u32 qth_max; /* Max average length threshold (bytes) */ 252 | unsigned char Wlog; /* log(W) */ 253 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 254 | unsigned char Scell_log; /* cell size for idle damping */ 255 | unsigned char flags; 256 | #define TC_RED_ECN 1 257 | #define TC_RED_HARDDROP 2 258 | #define TC_RED_ADAPTATIVE 4 259 | }; 260 | 261 | struct tc_red_xstats { 262 | __u32 early; /* Early drops */ 263 | __u32 pdrop; /* Drops due to queue limits */ 264 | __u32 other; /* Drops due to drop() calls */ 265 | __u32 marked; /* Marked packets */ 266 | }; 267 | 268 | /* GRED section */ 269 | 270 | #define MAX_DPs 16 271 
| 272 | enum { 273 | TCA_GRED_UNSPEC, 274 | TCA_GRED_PARMS, 275 | TCA_GRED_STAB, 276 | TCA_GRED_DPS, 277 | TCA_GRED_MAX_P, 278 | TCA_GRED_LIMIT, 279 | __TCA_GRED_MAX, 280 | }; 281 | 282 | #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) 283 | 284 | struct tc_gred_qopt { 285 | __u32 limit; /* HARD maximal queue length (bytes) */ 286 | __u32 qth_min; /* Min average length threshold (bytes) */ 287 | __u32 qth_max; /* Max average length threshold (bytes) */ 288 | __u32 DP; /* up to 2^32 DPs */ 289 | __u32 backlog; 290 | __u32 qave; 291 | __u32 forced; 292 | __u32 early; 293 | __u32 other; 294 | __u32 pdrop; 295 | __u8 Wlog; /* log(W) */ 296 | __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ 297 | __u8 Scell_log; /* cell size for idle damping */ 298 | __u8 prio; /* prio of this VQ */ 299 | __u32 packets; 300 | __u32 bytesin; 301 | }; 302 | 303 | /* gred setup */ 304 | struct tc_gred_sopt { 305 | __u32 DPs; 306 | __u32 def_DP; 307 | __u8 grio; 308 | __u8 flags; 309 | __u16 pad1; 310 | }; 311 | 312 | /* CHOKe section */ 313 | 314 | enum { 315 | TCA_CHOKE_UNSPEC, 316 | TCA_CHOKE_PARMS, 317 | TCA_CHOKE_STAB, 318 | TCA_CHOKE_MAX_P, 319 | __TCA_CHOKE_MAX, 320 | }; 321 | 322 | #define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) 323 | 324 | struct tc_choke_qopt { 325 | __u32 limit; /* Hard queue length (packets) */ 326 | __u32 qth_min; /* Min average threshold (packets) */ 327 | __u32 qth_max; /* Max average threshold (packets) */ 328 | unsigned char Wlog; /* log(W) */ 329 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 330 | unsigned char Scell_log; /* cell size for idle damping */ 331 | unsigned char flags; /* see RED flags */ 332 | }; 333 | 334 | struct tc_choke_xstats { 335 | __u32 early; /* Early drops */ 336 | __u32 pdrop; /* Drops due to queue limits */ 337 | __u32 other; /* Drops due to drop() calls */ 338 | __u32 marked; /* Marked packets */ 339 | __u32 matched; /* Drops due to flow match */ 340 | }; 341 | 342 | /* HTB section */ 343 | #define TC_HTB_NUMPRIO 8 344 | #define TC_HTB_MAXDEPTH 8 345 | #define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ 346 | 347 | struct tc_htb_opt { 348 | struct tc_ratespec rate; 349 | struct tc_ratespec ceil; 350 | __u32 buffer; 351 | __u32 cbuffer; 352 | __u32 quantum; 353 | __u32 level; /* out only */ 354 | __u32 prio; 355 | }; 356 | struct tc_htb_glob { 357 | __u32 version; /* to match HTB/TC */ 358 | __u32 rate2quantum; /* bps->quantum divisor */ 359 | __u32 defcls; /* default class number */ 360 | __u32 debug; /* debug flags */ 361 | 362 | /* stats */ 363 | __u32 direct_pkts; /* count of non shaped packets */ 364 | }; 365 | enum { 366 | TCA_HTB_UNSPEC, 367 | TCA_HTB_PARMS, 368 | TCA_HTB_INIT, 369 | TCA_HTB_CTAB, 370 | TCA_HTB_RTAB, 371 | TCA_HTB_DIRECT_QLEN, 372 | TCA_HTB_RATE64, 373 | TCA_HTB_CEIL64, 374 | __TCA_HTB_MAX, 375 | }; 376 | 377 | #define TCA_HTB_MAX (__TCA_HTB_MAX - 1) 378 | 379 | struct tc_htb_xstats { 380 | __u32 lends; 381 | __u32 borrows; 382 | __u32 giants; /* too big packets (rate will not be accurate) */ 383 | __u32 tokens; 384 | __u32 ctokens; 385 | }; 386 | 387 | /* HFSC section */ 388 | 389 | struct tc_hfsc_qopt { 390 | __u16 defcls; /* default class */ 391 | }; 392 | 393 | struct tc_service_curve { 394 | __u32 m1; /* slope of the first segment in bps */ 395 | __u32 d; /* x-projection of the first segment in us */ 396 | __u32 m2; /* slope of the second segment in bps */ 397 | }; 398 | 399 | struct tc_hfsc_stats { 400 | __u64 work; /* total work done */ 401 | __u64 rtwork; /* work done by real-time criteria */ 402 | __u32 period; /* 
current period */ 403 | __u32 level; /* class level in hierarchy */ 404 | }; 405 | 406 | enum { 407 | TCA_HFSC_UNSPEC, 408 | TCA_HFSC_RSC, 409 | TCA_HFSC_FSC, 410 | TCA_HFSC_USC, 411 | __TCA_HFSC_MAX, 412 | }; 413 | 414 | #define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) 415 | 416 | 417 | /* CBQ section */ 418 | 419 | #define TC_CBQ_MAXPRIO 8 420 | #define TC_CBQ_MAXLEVEL 8 421 | #define TC_CBQ_DEF_EWMA 5 422 | 423 | struct tc_cbq_lssopt { 424 | unsigned char change; 425 | unsigned char flags; 426 | #define TCF_CBQ_LSS_BOUNDED 1 427 | #define TCF_CBQ_LSS_ISOLATED 2 428 | unsigned char ewma_log; 429 | unsigned char level; 430 | #define TCF_CBQ_LSS_FLAGS 1 431 | #define TCF_CBQ_LSS_EWMA 2 432 | #define TCF_CBQ_LSS_MAXIDLE 4 433 | #define TCF_CBQ_LSS_MINIDLE 8 434 | #define TCF_CBQ_LSS_OFFTIME 0x10 435 | #define TCF_CBQ_LSS_AVPKT 0x20 436 | __u32 maxidle; 437 | __u32 minidle; 438 | __u32 offtime; 439 | __u32 avpkt; 440 | }; 441 | 442 | struct tc_cbq_wrropt { 443 | unsigned char flags; 444 | unsigned char priority; 445 | unsigned char cpriority; 446 | unsigned char __reserved; 447 | __u32 allot; 448 | __u32 weight; 449 | }; 450 | 451 | struct tc_cbq_ovl { 452 | unsigned char strategy; 453 | #define TC_CBQ_OVL_CLASSIC 0 454 | #define TC_CBQ_OVL_DELAY 1 455 | #define TC_CBQ_OVL_LOWPRIO 2 456 | #define TC_CBQ_OVL_DROP 3 457 | #define TC_CBQ_OVL_RCLASSIC 4 458 | unsigned char priority2; 459 | __u16 pad; 460 | __u32 penalty; 461 | }; 462 | 463 | struct tc_cbq_police { 464 | unsigned char police; 465 | unsigned char __res1; 466 | unsigned short __res2; 467 | }; 468 | 469 | struct tc_cbq_fopt { 470 | __u32 split; 471 | __u32 defmap; 472 | __u32 defchange; 473 | }; 474 | 475 | struct tc_cbq_xstats { 476 | __u32 borrows; 477 | __u32 overactions; 478 | __s32 avgidle; 479 | __s32 undertime; 480 | }; 481 | 482 | enum { 483 | TCA_CBQ_UNSPEC, 484 | TCA_CBQ_LSSOPT, 485 | TCA_CBQ_WRROPT, 486 | TCA_CBQ_FOPT, 487 | TCA_CBQ_OVL_STRATEGY, 488 | TCA_CBQ_RATE, 489 | TCA_CBQ_RTAB, 490 | TCA_CBQ_POLICE, 491 | __TCA_CBQ_MAX, 492 | }; 493 | 494 | #define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) 495 | 496 | /* dsmark section */ 497 | 498 | enum { 499 | TCA_DSMARK_UNSPEC, 500 | TCA_DSMARK_INDICES, 501 | TCA_DSMARK_DEFAULT_INDEX, 502 | TCA_DSMARK_SET_TC_INDEX, 503 | TCA_DSMARK_MASK, 504 | TCA_DSMARK_VALUE, 505 | __TCA_DSMARK_MAX, 506 | }; 507 | 508 | #define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) 509 | 510 | /* ATM section */ 511 | 512 | enum { 513 | TCA_ATM_UNSPEC, 514 | TCA_ATM_FD, /* file/socket descriptor */ 515 | TCA_ATM_PTR, /* pointer to descriptor - later */ 516 | TCA_ATM_HDR, /* LL header */ 517 | TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ 518 | TCA_ATM_ADDR, /* PVC address (for output only) */ 519 | TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ 520 | __TCA_ATM_MAX, 521 | }; 522 | 523 | #define TCA_ATM_MAX (__TCA_ATM_MAX - 1) 524 | 525 | /* Network emulator */ 526 | 527 | enum { 528 | TCA_NETEM_UNSPEC, 529 | TCA_NETEM_CORR, 530 | TCA_NETEM_DELAY_DIST, 531 | TCA_NETEM_REORDER, 532 | TCA_NETEM_CORRUPT, 533 | TCA_NETEM_LOSS, 534 | TCA_NETEM_RATE, 535 | TCA_NETEM_ECN, 536 | TCA_NETEM_RATE64, 537 | __TCA_NETEM_MAX, 538 | }; 539 | 540 | #define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) 541 | 542 | struct tc_netem_qopt { 543 | __u32 latency; /* added delay (us) */ 544 | __u32 limit; /* fifo limit (packets) */ 545 | __u32 loss; /* random packet loss (0=none ~0=100%) */ 546 | __u32 gap; /* re-ordering gap (0 for none) */ 547 | __u32 duplicate; /* random packet dup (0=none ~0=100%) */ 548 | __u32 jitter; /* random jitter 
in latency (us) */ 549 | }; 550 | 551 | struct tc_netem_corr { 552 | __u32 delay_corr; /* delay correlation */ 553 | __u32 loss_corr; /* packet loss correlation */ 554 | __u32 dup_corr; /* duplicate correlation */ 555 | }; 556 | 557 | struct tc_netem_reorder { 558 | __u32 probability; 559 | __u32 correlation; 560 | }; 561 | 562 | struct tc_netem_corrupt { 563 | __u32 probability; 564 | __u32 correlation; 565 | }; 566 | 567 | struct tc_netem_rate { 568 | __u32 rate; /* byte/s */ 569 | __s32 packet_overhead; 570 | __u32 cell_size; 571 | __s32 cell_overhead; 572 | }; 573 | 574 | enum { 575 | NETEM_LOSS_UNSPEC, 576 | NETEM_LOSS_GI, /* General Intuitive - 4 state model */ 577 | NETEM_LOSS_GE, /* Gilbert Elliot models */ 578 | __NETEM_LOSS_MAX 579 | }; 580 | #define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) 581 | 582 | /* State transition probabilities for 4 state model */ 583 | struct tc_netem_gimodel { 584 | __u32 p13; 585 | __u32 p31; 586 | __u32 p32; 587 | __u32 p14; 588 | __u32 p23; 589 | }; 590 | 591 | /* Gilbert-Elliot models */ 592 | struct tc_netem_gemodel { 593 | __u32 p; 594 | __u32 r; 595 | __u32 h; 596 | __u32 k1; 597 | }; 598 | 599 | #define NETEM_DIST_SCALE 8192 600 | #define NETEM_DIST_MAX 16384 601 | 602 | /* DRR */ 603 | 604 | enum { 605 | TCA_DRR_UNSPEC, 606 | TCA_DRR_QUANTUM, 607 | __TCA_DRR_MAX 608 | }; 609 | 610 | #define TCA_DRR_MAX (__TCA_DRR_MAX - 1) 611 | 612 | struct tc_drr_stats { 613 | __u32 deficit; 614 | }; 615 | 616 | /* MQPRIO */ 617 | #define TC_QOPT_BITMASK 15 618 | #define TC_QOPT_MAX_QUEUE 16 619 | 620 | struct tc_mqprio_qopt { 621 | __u8 num_tc; 622 | __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; 623 | __u8 hw; 624 | __u16 count[TC_QOPT_MAX_QUEUE]; 625 | __u16 offset[TC_QOPT_MAX_QUEUE]; 626 | }; 627 | 628 | /* SFB */ 629 | 630 | enum { 631 | TCA_SFB_UNSPEC, 632 | TCA_SFB_PARMS, 633 | __TCA_SFB_MAX, 634 | }; 635 | 636 | #define TCA_SFB_MAX (__TCA_SFB_MAX - 1) 637 | 638 | /* 639 | * Note: increment, decrement are Q0.16 fixed-point values. 
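 * For example, 0x8000 corresponds to a probability of 0.5 and 0xFFFF to
 * (almost) 1.0, i.e. the value divided by 65536.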
640 | */ 641 | struct tc_sfb_qopt { 642 | __u32 rehash_interval; /* delay between hash move, in ms */ 643 | __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ 644 | __u32 max; /* max len of qlen_min */ 645 | __u32 bin_size; /* maximum queue length per bin */ 646 | __u32 increment; /* probability increment, (d1 in Blue) */ 647 | __u32 decrement; /* probability decrement, (d2 in Blue) */ 648 | __u32 limit; /* max SFB queue length */ 649 | __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ 650 | __u32 penalty_burst; 651 | }; 652 | 653 | struct tc_sfb_xstats { 654 | __u32 earlydrop; 655 | __u32 penaltydrop; 656 | __u32 bucketdrop; 657 | __u32 queuedrop; 658 | __u32 childdrop; /* drops in child qdisc */ 659 | __u32 marked; 660 | __u32 maxqlen; 661 | __u32 maxprob; 662 | __u32 avgprob; 663 | }; 664 | 665 | #define SFB_MAX_PROB 0xFFFF 666 | 667 | /* QFQ */ 668 | enum { 669 | TCA_QFQ_UNSPEC, 670 | TCA_QFQ_WEIGHT, 671 | TCA_QFQ_LMAX, 672 | __TCA_QFQ_MAX 673 | }; 674 | 675 | #define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) 676 | 677 | struct tc_qfq_stats { 678 | __u32 weight; 679 | __u32 lmax; 680 | }; 681 | 682 | /* CODEL */ 683 | 684 | enum { 685 | TCA_CODEL_UNSPEC, 686 | TCA_CODEL_TARGET, 687 | TCA_CODEL_LIMIT, 688 | TCA_CODEL_INTERVAL, 689 | TCA_CODEL_ECN, 690 | TCA_CODEL_CE_THRESHOLD, 691 | __TCA_CODEL_MAX 692 | }; 693 | 694 | #define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) 695 | 696 | struct tc_codel_xstats { 697 | __u32 maxpacket; /* largest packet we've seen so far */ 698 | __u32 count; /* how many drops we've done since the last time we 699 | * entered dropping state 700 | */ 701 | __u32 lastcount; /* count at entry to dropping state */ 702 | __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ 703 | __s32 drop_next; /* time to drop next packet */ 704 | __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ 705 | __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ 706 | __u32 dropping; /* are we in dropping state ? 
*/ 707 | __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ 708 | }; 709 | 710 | /* FQ_CODEL */ 711 | 712 | enum { 713 | TCA_FQ_CODEL_UNSPEC, 714 | TCA_FQ_CODEL_TARGET, 715 | TCA_FQ_CODEL_LIMIT, 716 | TCA_FQ_CODEL_INTERVAL, 717 | TCA_FQ_CODEL_ECN, 718 | TCA_FQ_CODEL_FLOWS, 719 | TCA_FQ_CODEL_QUANTUM, 720 | TCA_FQ_CODEL_CE_THRESHOLD, 721 | __TCA_FQ_CODEL_MAX 722 | }; 723 | 724 | #define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) 725 | 726 | enum { 727 | TCA_FQ_CODEL_XSTATS_QDISC, 728 | TCA_FQ_CODEL_XSTATS_CLASS, 729 | }; 730 | 731 | struct tc_fq_codel_qd_stats { 732 | __u32 maxpacket; /* largest packet we've seen so far */ 733 | __u32 drop_overlimit; /* number of time max qdisc 734 | * packet limit was hit 735 | */ 736 | __u32 ecn_mark; /* number of packets we ECN marked 737 | * instead of being dropped 738 | */ 739 | __u32 new_flow_count; /* number of time packets 740 | * created a 'new flow' 741 | */ 742 | __u32 new_flows_len; /* count of flows in new list */ 743 | __u32 old_flows_len; /* count of flows in old list */ 744 | __u32 ce_mark; /* packets above ce_threshold */ 745 | }; 746 | 747 | struct tc_fq_codel_cl_stats { 748 | __s32 deficit; 749 | __u32 ldelay; /* in-queue delay seen by most recently 750 | * dequeued packet 751 | */ 752 | __u32 count; 753 | __u32 lastcount; 754 | __u32 dropping; 755 | __s32 drop_next; 756 | }; 757 | 758 | struct tc_fq_codel_xstats { 759 | __u32 type; 760 | union { 761 | struct tc_fq_codel_qd_stats qdisc_stats; 762 | struct tc_fq_codel_cl_stats class_stats; 763 | }; 764 | }; 765 | 766 | /* FQ */ 767 | 768 | enum { 769 | TCA_FQ_UNSPEC, 770 | 771 | TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ 772 | 773 | TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ 774 | 775 | TCA_FQ_QUANTUM, /* RR quantum */ 776 | 777 | TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ 778 | 779 | TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ 780 | 781 | TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ 782 | 783 | TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ 784 | 785 | TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ 786 | 787 | TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ 788 | 789 | TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ 790 | 791 | __TCA_FQ_MAX 792 | }; 793 | 794 | #define TCA_FQ_MAX (__TCA_FQ_MAX - 1) 795 | 796 | struct tc_fq_qd_stats { 797 | __u64 gc_flows; 798 | __u64 highprio_packets; 799 | __u64 tcp_retrans; 800 | __u64 throttled; 801 | __u64 flows_plimit; 802 | __u64 pkts_too_long; 803 | __u64 allocation_errors; 804 | __s64 time_next_delayed_flow; 805 | __u32 flows; 806 | __u32 inactive_flows; 807 | __u32 throttled_flows; 808 | __u32 pad; 809 | }; 810 | 811 | /* Heavy-Hitter Filter */ 812 | 813 | enum { 814 | TCA_HHF_UNSPEC, 815 | TCA_HHF_BACKLOG_LIMIT, 816 | TCA_HHF_QUANTUM, 817 | TCA_HHF_HH_FLOWS_LIMIT, 818 | TCA_HHF_RESET_TIMEOUT, 819 | TCA_HHF_ADMIT_BYTES, 820 | TCA_HHF_EVICT_TIMEOUT, 821 | TCA_HHF_NON_HH_WEIGHT, 822 | __TCA_HHF_MAX 823 | }; 824 | 825 | #define TCA_HHF_MAX (__TCA_HHF_MAX - 1) 826 | 827 | struct tc_hhf_xstats { 828 | __u32 drop_overlimit; /* number of times max qdisc packet limit 829 | * was hit 830 | */ 831 | __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ 832 | __u32 hh_tot_count; /* number of captured heavy-hitters so far */ 833 | __u32 hh_cur_count; /* number of current heavy-hitters */ 834 | }; 835 | 836 | /* PIE */ 837 | enum { 838 | TCA_PIE_UNSPEC, 839 | TCA_PIE_TARGET, 840 | TCA_PIE_LIMIT, 841 | TCA_PIE_TUPDATE, 842 | 
TCA_PIE_ALPHA, 843 | TCA_PIE_BETA, 844 | TCA_PIE_ECN, 845 | TCA_PIE_BYTEMODE, 846 | __TCA_PIE_MAX 847 | }; 848 | #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) 849 | 850 | struct tc_pie_xstats { 851 | __u32 prob; /* current probability */ 852 | __u32 delay; /* current delay in ms */ 853 | __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ 854 | __u32 packets_in; /* total number of packets enqueued */ 855 | __u32 dropped; /* packets dropped due to pie_action */ 856 | __u32 overlimit; /* dropped due to lack of space in queue */ 857 | __u32 maxq; /* maximum queue size */ 858 | __u32 ecn_mark; /* packets marked with ecn*/ 859 | }; 860 | 861 | /* CAKE */ 862 | enum { 863 | TCA_CAKE_UNSPEC, 864 | TCA_CAKE_PAD, 865 | TCA_CAKE_BASE_RATE64, 866 | TCA_CAKE_DIFFSERV_MODE, 867 | TCA_CAKE_ATM, 868 | TCA_CAKE_FLOW_MODE, 869 | TCA_CAKE_OVERHEAD, 870 | TCA_CAKE_RTT, 871 | TCA_CAKE_TARGET, 872 | TCA_CAKE_AUTORATE, 873 | TCA_CAKE_MEMORY, 874 | TCA_CAKE_NAT, 875 | TCA_CAKE_RAW, // was _ETHERNET 876 | TCA_CAKE_WASH, 877 | TCA_CAKE_MPU, 878 | TCA_CAKE_INGRESS, 879 | TCA_CAKE_ACK_FILTER, 880 | TCA_CAKE_SPLIT_GSO, 881 | TCA_CAKE_FWMARK, 882 | __TCA_CAKE_MAX 883 | }; 884 | #define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) 885 | 886 | enum { 887 | __TCA_CAKE_STATS_INVALID, 888 | TCA_CAKE_STATS_PAD, 889 | TCA_CAKE_STATS_CAPACITY_ESTIMATE64, 890 | TCA_CAKE_STATS_MEMORY_LIMIT, 891 | TCA_CAKE_STATS_MEMORY_USED, 892 | TCA_CAKE_STATS_AVG_NETOFF, 893 | TCA_CAKE_STATS_MIN_NETLEN, 894 | TCA_CAKE_STATS_MAX_NETLEN, 895 | TCA_CAKE_STATS_MIN_ADJLEN, 896 | TCA_CAKE_STATS_MAX_ADJLEN, 897 | TCA_CAKE_STATS_TIN_STATS, 898 | TCA_CAKE_STATS_DEFICIT, 899 | TCA_CAKE_STATS_COBALT_COUNT, 900 | TCA_CAKE_STATS_DROPPING, 901 | TCA_CAKE_STATS_DROP_NEXT_US, 902 | TCA_CAKE_STATS_P_DROP, 903 | TCA_CAKE_STATS_BLUE_TIMER_US, 904 | __TCA_CAKE_STATS_MAX 905 | }; 906 | #define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) 907 | 908 | enum { 909 | __TCA_CAKE_TIN_STATS_INVALID, 910 | TCA_CAKE_TIN_STATS_PAD, 911 | TCA_CAKE_TIN_STATS_SENT_PACKETS, 912 | TCA_CAKE_TIN_STATS_SENT_BYTES64, 913 | TCA_CAKE_TIN_STATS_DROPPED_PACKETS, 914 | TCA_CAKE_TIN_STATS_DROPPED_BYTES64, 915 | TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, 916 | TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, 917 | TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, 918 | TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, 919 | TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, 920 | TCA_CAKE_TIN_STATS_BACKLOG_BYTES, 921 | TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, 922 | TCA_CAKE_TIN_STATS_TARGET_US, 923 | TCA_CAKE_TIN_STATS_INTERVAL_US, 924 | TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, 925 | TCA_CAKE_TIN_STATS_WAY_MISSES, 926 | TCA_CAKE_TIN_STATS_WAY_COLLISIONS, 927 | TCA_CAKE_TIN_STATS_PEAK_DELAY_US, 928 | TCA_CAKE_TIN_STATS_AVG_DELAY_US, 929 | TCA_CAKE_TIN_STATS_BASE_DELAY_US, 930 | TCA_CAKE_TIN_STATS_SPARSE_FLOWS, 931 | TCA_CAKE_TIN_STATS_BULK_FLOWS, 932 | TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, 933 | TCA_CAKE_TIN_STATS_MAX_SKBLEN, 934 | TCA_CAKE_TIN_STATS_FLOW_QUANTUM, 935 | __TCA_CAKE_TIN_STATS_MAX 936 | }; 937 | #define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) 938 | #define TC_CAKE_MAX_TINS (8) 939 | 940 | enum { 941 | CAKE_FLOW_NONE = 0, 942 | CAKE_FLOW_SRC_IP, 943 | CAKE_FLOW_DST_IP, 944 | CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ 945 | CAKE_FLOW_FLOWS, 946 | CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ 947 | CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ 948 | CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ 949 | CAKE_FLOW_MAX, 950 | }; 951 | 952 | enum { 953 | 
CAKE_DIFFSERV_DIFFSERV3 = 0, 954 | CAKE_DIFFSERV_DIFFSERV4, 955 | CAKE_DIFFSERV_DIFFSERV8, 956 | CAKE_DIFFSERV_BESTEFFORT, 957 | CAKE_DIFFSERV_PRECEDENCE, 958 | CAKE_DIFFSERV_MAX 959 | }; 960 | 961 | enum { 962 | CAKE_ACK_NONE = 0, 963 | CAKE_ACK_FILTER, 964 | CAKE_ACK_AGGRESSIVE, 965 | CAKE_ACK_MAX 966 | }; 967 | 968 | enum { 969 | CAKE_ATM_NONE = 0, 970 | CAKE_ATM_ATM, 971 | CAKE_ATM_PTM, 972 | CAKE_ATM_MAX 973 | }; 974 | 975 | #endif 976 | -------------------------------------------------------------------------------- /sch_cake.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 | 3 | /* COMMON Applications Kept Enhanced (CAKE) discipline 4 | * 5 | * Copyright (C) 2014-2018 Jonathan Morton 6 | * Copyright (C) 2015-2018 Toke Høiland-Jørgensen 7 | * Copyright (C) 2014-2018 Dave Täht 8 | * Copyright (C) 2015-2018 Sebastian Moeller 9 | * (C) 2015-2018 Kevin Darbyshire-Bryant 10 | * Copyright (C) 2017-2018 Ryan Mounce 11 | * 12 | * The CAKE Principles: 13 | * (or, how to have your cake and eat it too) 14 | * 15 | * This is a combination of several shaping, AQM and FQ techniques into one 16 | * easy-to-use package: 17 | * 18 | * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE 19 | * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), 20 | * eliminating the need for any sort of burst parameter (eg. token bucket 21 | * depth). Burst support is limited to that necessary to overcome scheduling 22 | * latency. 23 | * 24 | * - A Diffserv-aware priority queue, giving more priority to certain classes, 25 | * up to a specified fraction of bandwidth. Above that bandwidth threshold, 26 | * the priority is reduced to avoid starving other tins. 27 | * 28 | * - Each priority tin has a separate Flow Queue system, to isolate traffic 29 | * flows from each other. This prevents a burst on one flow from increasing 30 | * the delay to another. Flows are distributed to queues using a 31 | * set-associative hash function. 32 | * 33 | * - Each queue is actively managed by Cobalt, which is a combination of the 34 | * Codel and Blue AQM algorithms. This serves flows fairly, and signals 35 | * congestion early via ECN (if available) and/or packet drops, to keep 36 | * latency low. The codel parameters are auto-tuned based on the bandwidth 37 | * setting, as is necessary at low bandwidths. 38 | * 39 | * The configuration parameters are kept deliberately simple for ease of use. 40 | * Everything has sane defaults. Complete generality of configuration is *not* 41 | * a goal. 42 | * 43 | * The priority queue operates according to a weighted DRR scheme, combined with 44 | * a bandwidth tracker which reuses the shaper logic to detect which side of the 45 | * bandwidth sharing threshold the tin is operating. This determines whether a 46 | * priority-based weight (high) or a bandwidth-based weight (low) is used for 47 | * that tin in the current pass. 48 | * 49 | * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly 50 | * granted us permission to leverage. 
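 *
 * Typical invocation (illustrative only; the parameter values are
 * arbitrary):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which enables the shaper at 20 Mbit/s, the four-tin Diffserv mode
 * described above, and NAT-aware flow hashing.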
51 | */ 52 | 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include 67 | #include 68 | #include "pkt_sched.h" 69 | #include 70 | #include 71 | #include 72 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) 73 | #include 74 | #else 75 | #include 76 | #endif 77 | #include "cobalt_compat.h" 78 | 79 | #if IS_REACHABLE(CONFIG_NF_CONNTRACK) 80 | #include 81 | #include 82 | #include 83 | #endif 84 | 85 | #define CAKE_SET_WAYS (8) 86 | #define CAKE_MAX_TINS (8) 87 | #define CAKE_QUEUES (1024) 88 | #define CAKE_FLOW_MASK 63 89 | #define CAKE_FLOW_NAT_FLAG 64 90 | 91 | /* struct cobalt_params - contains codel and blue parameters 92 | * @interval: codel initial drop rate 93 | * @target: maximum persistent sojourn time & blue update rate 94 | * @mtu_time: serialisation delay of maximum-size packet 95 | * @p_inc: increment of blue drop probability (0.32 fxp) 96 | * @p_dec: decrement of blue drop probability (0.32 fxp) 97 | */ 98 | struct cobalt_params { 99 | u64 interval; 100 | u64 target; 101 | u64 mtu_time; 102 | u32 p_inc; 103 | u32 p_dec; 104 | }; 105 | 106 | /* struct cobalt_vars - contains codel and blue variables 107 | * @count: codel dropping frequency 108 | * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 109 | * @drop_next: time to drop next packet, or when we dropped last 110 | * @blue_timer: Blue time to next drop 111 | * @p_drop: BLUE drop probability (0.32 fxp) 112 | * @dropping: set if in dropping state 113 | * @ecn_marked: set if marked 114 | */ 115 | struct cobalt_vars { 116 | u32 count; 117 | u32 rec_inv_sqrt; 118 | ktime_t drop_next; 119 | ktime_t blue_timer; 120 | u32 p_drop; 121 | bool dropping; 122 | bool ecn_marked; 123 | }; 124 | 125 | enum { 126 | CAKE_SET_NONE = 0, 127 | CAKE_SET_SPARSE, 128 | CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 129 | CAKE_SET_BULK, 130 | CAKE_SET_DECAYING 131 | }; 132 | 133 | struct cake_flow { 134 | /* this stuff is all needed per-flow at dequeue time */ 135 | struct sk_buff *head; 136 | struct sk_buff *tail; 137 | struct list_head flowchain; 138 | s32 deficit; 139 | u32 dropped; 140 | struct cobalt_vars cvars; 141 | u16 srchost; /* index into cake_host table */ 142 | u16 dsthost; 143 | u8 set; 144 | }; /* please try to keep this structure <= 64 bytes */ 145 | 146 | struct cake_host { 147 | u32 srchost_tag; 148 | u32 dsthost_tag; 149 | u16 srchost_bulk_flow_count; 150 | u16 dsthost_bulk_flow_count; 151 | }; 152 | 153 | struct cake_heap_entry { 154 | u16 t:3, b:10; 155 | }; 156 | 157 | struct cake_tin_data { 158 | struct cake_flow flows[CAKE_QUEUES]; 159 | u32 backlogs[CAKE_QUEUES]; 160 | u32 tags[CAKE_QUEUES]; /* for set association */ 161 | u16 overflow_idx[CAKE_QUEUES]; 162 | struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 163 | u32 perturb; 164 | u16 flow_quantum; 165 | 166 | struct cobalt_params cparams; 167 | u32 drop_overlimit; 168 | u16 bulk_flow_count; 169 | u16 sparse_flow_count; 170 | u16 decaying_flow_count; 171 | u16 unresponsive_flow_count; 172 | 173 | u32 max_skblen; 174 | 175 | struct list_head new_flows; 176 | struct list_head old_flows; 177 | struct list_head decaying_flows; 178 | 179 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 180 | ktime_t time_next_packet; 181 | u64 tin_rate_ns; 182 | u64 tin_rate_bps; 183 | u16 tin_rate_shft; 184 | 185 | u16 tin_quantum; 186 | s32 tin_deficit; 187 | u32 tin_backlog; 188 | u32 tin_dropped; 189 | u32 
tin_ecn_mark; 190 | 191 | u32 packets; 192 | u64 bytes; 193 | 194 | u32 ack_drops; 195 | 196 | /* moving averages */ 197 | u64 avge_delay; 198 | u64 peak_delay; 199 | u64 base_delay; 200 | 201 | /* hash function stats */ 202 | u32 way_directs; 203 | u32 way_hits; 204 | u32 way_misses; 205 | u32 way_collisions; 206 | }; /* number of tins is small, so size of this struct doesn't matter much */ 207 | 208 | struct cake_sched_data { 209 | struct tcf_proto __rcu *filter_list; /* optional external classifier */ 210 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) 211 | struct tcf_block *block; 212 | #endif 213 | struct cake_tin_data *tins; 214 | 215 | struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 216 | u16 overflow_timeout; 217 | 218 | u16 tin_cnt; 219 | u8 tin_mode; 220 | u8 flow_mode; 221 | u8 ack_filter; 222 | u8 atm_mode; 223 | 224 | u32 fwmark_mask; 225 | u16 fwmark_shft; 226 | 227 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 228 | u16 rate_shft; 229 | ktime_t time_next_packet; 230 | ktime_t failsafe_next_packet; 231 | u64 rate_ns; 232 | u64 rate_bps; 233 | u16 rate_flags; 234 | s16 rate_overhead; 235 | u16 rate_mpu; 236 | u64 interval; 237 | u64 target; 238 | 239 | /* resource tracking */ 240 | u32 buffer_used; 241 | u32 buffer_max_used; 242 | u32 buffer_limit; 243 | u32 buffer_config_limit; 244 | 245 | /* indices for dequeue */ 246 | u16 cur_tin; 247 | u16 cur_flow; 248 | 249 | struct qdisc_watchdog watchdog; 250 | const u8 *tin_index; 251 | const u8 *tin_order; 252 | 253 | /* bandwidth capacity estimate */ 254 | ktime_t last_packet_time; 255 | ktime_t avg_window_begin; 256 | u64 avg_packet_interval; 257 | u64 avg_window_bytes; 258 | u64 avg_peak_bandwidth; 259 | ktime_t last_reconfig_time; 260 | 261 | /* packet length stats */ 262 | u32 avg_netoff; 263 | u16 max_netlen; 264 | u16 max_adjlen; 265 | u16 min_netlen; 266 | u16 min_adjlen; 267 | }; 268 | 269 | enum { 270 | CAKE_FLAG_OVERHEAD = BIT(0), 271 | CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 272 | CAKE_FLAG_INGRESS = BIT(2), 273 | CAKE_FLAG_WASH = BIT(3), 274 | CAKE_FLAG_SPLIT_GSO = BIT(4) 275 | }; 276 | 277 | /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 278 | * obtain the best features of each. Codel is excellent on flows which 279 | * respond to congestion signals in a TCP-like way. BLUE is more effective on 280 | * unresponsive flows. 
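 * The two signals are combined at dequeue time: cobalt_should_drop() below
 * evaluates the Codel schedule first and then ORs in a random drop with
 * probability p_drop supplied by the BLUE side.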
281 | */ 282 | 283 | struct cobalt_skb_cb { 284 | ktime_t enqueue_time; 285 | u32 adjusted_len; 286 | }; 287 | 288 | static u64 us_to_ns(u64 us) 289 | { 290 | return us * NSEC_PER_USEC; 291 | } 292 | 293 | static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 294 | { 295 | qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 296 | return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 297 | } 298 | 299 | static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 300 | { 301 | return get_cobalt_cb(skb)->enqueue_time; 302 | } 303 | 304 | static void cobalt_set_enqueue_time(struct sk_buff *skb, 305 | ktime_t now) 306 | { 307 | get_cobalt_cb(skb)->enqueue_time = now; 308 | } 309 | 310 | static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 311 | 312 | /* Diffserv lookup tables */ 313 | 314 | static const u8 precedence[] = { 315 | 0, 0, 0, 0, 0, 0, 0, 0, 316 | 1, 1, 1, 1, 1, 1, 1, 1, 317 | 2, 2, 2, 2, 2, 2, 2, 2, 318 | 3, 3, 3, 3, 3, 3, 3, 3, 319 | 4, 4, 4, 4, 4, 4, 4, 4, 320 | 5, 5, 5, 5, 5, 5, 5, 5, 321 | 6, 6, 6, 6, 6, 6, 6, 6, 322 | 7, 7, 7, 7, 7, 7, 7, 7, 323 | }; 324 | 325 | static const u8 diffserv8[] = { 326 | 2, 0, 1, 2, 4, 2, 2, 2, 327 | 1, 2, 1, 2, 1, 2, 1, 2, 328 | 5, 2, 4, 2, 4, 2, 4, 2, 329 | 3, 2, 3, 2, 3, 2, 3, 2, 330 | 6, 2, 3, 2, 3, 2, 3, 2, 331 | 6, 2, 2, 2, 6, 2, 6, 2, 332 | 7, 2, 2, 2, 2, 2, 2, 2, 333 | 7, 2, 2, 2, 2, 2, 2, 2, 334 | }; 335 | 336 | static const u8 diffserv4[] = { 337 | 0, 1, 0, 0, 2, 0, 0, 0, 338 | 1, 0, 0, 0, 0, 0, 0, 0, 339 | 2, 0, 2, 0, 2, 0, 2, 0, 340 | 2, 0, 2, 0, 2, 0, 2, 0, 341 | 3, 0, 2, 0, 2, 0, 2, 0, 342 | 3, 0, 0, 0, 3, 0, 3, 0, 343 | 3, 0, 0, 0, 0, 0, 0, 0, 344 | 3, 0, 0, 0, 0, 0, 0, 0, 345 | }; 346 | 347 | static const u8 diffserv3[] = { 348 | 0, 1, 0, 0, 2, 0, 0, 0, 349 | 1, 0, 0, 0, 0, 0, 0, 0, 350 | 0, 0, 0, 0, 0, 0, 0, 0, 351 | 0, 0, 0, 0, 0, 0, 0, 0, 352 | 0, 0, 0, 0, 0, 0, 0, 0, 353 | 0, 0, 0, 0, 2, 0, 2, 0, 354 | 2, 0, 0, 0, 0, 0, 0, 0, 355 | 2, 0, 0, 0, 0, 0, 0, 0, 356 | }; 357 | 358 | static const u8 besteffort[] = { 359 | 0, 0, 0, 0, 0, 0, 0, 0, 360 | 0, 0, 0, 0, 0, 0, 0, 0, 361 | 0, 0, 0, 0, 0, 0, 0, 0, 362 | 0, 0, 0, 0, 0, 0, 0, 0, 363 | 0, 0, 0, 0, 0, 0, 0, 0, 364 | 0, 0, 0, 0, 0, 0, 0, 0, 365 | 0, 0, 0, 0, 0, 0, 0, 0, 366 | 0, 0, 0, 0, 0, 0, 0, 0, 367 | }; 368 | 369 | /* tin priority order for stats dumping */ 370 | 371 | static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 372 | static const u8 bulk_order[] = {1, 0, 2, 3}; 373 | 374 | #define REC_INV_SQRT_CACHE (16) 375 | static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; 376 | 377 | /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 378 | * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 379 | * 380 | * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 381 | */ 382 | 383 | static void cobalt_newton_step(struct cobalt_vars *vars) 384 | { 385 | u32 invsqrt, invsqrt2; 386 | u64 val; 387 | 388 | invsqrt = vars->rec_inv_sqrt; 389 | invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 390 | val = (3LL << 32) - ((u64)vars->count * invsqrt2); 391 | 392 | val >>= 2; /* avoid overflow in following multiply */ 393 | val = (val * invsqrt) >> (32 - 2 + 1); 394 | 395 | vars->rec_inv_sqrt = val; 396 | } 397 | 398 | static void cobalt_invsqrt(struct cobalt_vars *vars) 399 | { 400 | if (vars->count < REC_INV_SQRT_CACHE) 401 | vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; 402 | else 403 | cobalt_newton_step(vars); 404 | } 405 | 406 | /* There is a big difference in timing between the accurate values 
placed in 407 | * the cache and the approximations given by a single Newton step for small 408 | * count values, particularly when stepping from count 1 to 2 or vice versa. 409 | * Above 16, a single Newton step gives sufficient accuracy in either 410 | * direction, given the precision stored. 411 | * 412 | * The magnitude of the error when stepping up to count 2 is such as to give 413 | * the value that *should* have been produced at count 4. 414 | */ 415 | 416 | static void cobalt_cache_init(void) 417 | { 418 | struct cobalt_vars v; 419 | 420 | memset(&v, 0, sizeof(v)); 421 | v.rec_inv_sqrt = ~0U; 422 | cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; 423 | 424 | for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { 425 | cobalt_newton_step(&v); 426 | cobalt_newton_step(&v); 427 | cobalt_newton_step(&v); 428 | cobalt_newton_step(&v); 429 | 430 | cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; 431 | } 432 | } 433 | 434 | static void cobalt_vars_init(struct cobalt_vars *vars) 435 | { 436 | memset(vars, 0, sizeof(*vars)); 437 | 438 | if (!cobalt_rec_inv_sqrt_cache[0]) { 439 | cobalt_cache_init(); 440 | cobalt_rec_inv_sqrt_cache[0] = ~0; 441 | } 442 | } 443 | 444 | /* CoDel control_law is t + interval/sqrt(count) 445 | * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 446 | * both sqrt() and divide operation. 447 | */ 448 | static ktime_t cobalt_control(ktime_t t, 449 | u64 interval, 450 | u32 rec_inv_sqrt) 451 | { 452 | return ktime_add_ns(t, reciprocal_scale(interval, 453 | rec_inv_sqrt)); 454 | } 455 | 456 | /* Call this when a packet had to be dropped due to queue overflow. Returns 457 | * true if the BLUE state was quiescent before but active after this call. 458 | */ 459 | static bool cobalt_queue_full(struct cobalt_vars *vars, 460 | struct cobalt_params *p, 461 | ktime_t now) 462 | { 463 | bool up = false; 464 | 465 | if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 466 | up = !vars->p_drop; 467 | vars->p_drop += p->p_inc; 468 | if (vars->p_drop < p->p_inc) 469 | vars->p_drop = ~0; 470 | vars->blue_timer = now; 471 | } 472 | vars->dropping = true; 473 | vars->drop_next = now; 474 | if (!vars->count) 475 | vars->count = 1; 476 | 477 | return up; 478 | } 479 | 480 | /* Call this when the queue was serviced but turned out to be empty. Returns 481 | * true if the BLUE state was active before but quiescent after this call. 
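 * This is the counterpart of cobalt_queue_full() above: that one raises
 * p_drop by p_inc on overflow, this one decays it by p_dec once the queue
 * drains, so an idle queue eventually returns BLUE to quiescence.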
482 | */ 483 | static bool cobalt_queue_empty(struct cobalt_vars *vars, 484 | struct cobalt_params *p, 485 | ktime_t now) 486 | { 487 | bool down = false; 488 | 489 | if (vars->p_drop && 490 | ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 491 | if (vars->p_drop < p->p_dec) 492 | vars->p_drop = 0; 493 | else 494 | vars->p_drop -= p->p_dec; 495 | vars->blue_timer = now; 496 | down = !vars->p_drop; 497 | } 498 | vars->dropping = false; 499 | 500 | if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 501 | vars->count--; 502 | cobalt_invsqrt(vars); 503 | vars->drop_next = cobalt_control(vars->drop_next, 504 | p->interval, 505 | vars->rec_inv_sqrt); 506 | } 507 | 508 | return down; 509 | } 510 | 511 | static __be16 cake_skb_proto(const struct sk_buff *skb) 512 | { 513 | unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); 514 | __be16 proto = skb->protocol; 515 | struct vlan_hdr vhdr, *vh; 516 | 517 | while (proto == htons(ETH_P_8021Q) || proto == htons(ETH_P_8021AD)) { 518 | vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); 519 | if (!vh) 520 | break; 521 | 522 | proto = vh->h_vlan_encapsulated_proto; 523 | offset += sizeof(vhdr); 524 | } 525 | 526 | return proto; 527 | } 528 | 529 | static int cake_set_ce(struct sk_buff *skb) 530 | { 531 | int wlen = skb_network_offset(skb); 532 | 533 | switch (cake_skb_proto(skb)) { 534 | case htons(ETH_P_IP): 535 | wlen += sizeof(struct iphdr); 536 | if (!pskb_may_pull(skb, wlen) || 537 | skb_try_make_writable(skb, wlen)) 538 | return 0; 539 | 540 | return IP_ECN_set_ce(ip_hdr(skb)); 541 | 542 | case htons(ETH_P_IPV6): 543 | wlen += sizeof(struct ipv6hdr); 544 | if (!pskb_may_pull(skb, wlen) || 545 | skb_try_make_writable(skb, wlen)) 546 | return 0; 547 | 548 | return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); 549 | 550 | default: 551 | return 0; 552 | } 553 | 554 | return 0; 555 | } 556 | 557 | /* Call this with a freshly dequeued packet for possible congestion marking. 558 | * Returns true as an instruction to drop the packet, false for delivery. 559 | */ 560 | static bool cobalt_should_drop(struct cobalt_vars *vars, 561 | struct cobalt_params *p, 562 | ktime_t now, 563 | struct sk_buff *skb, 564 | u32 bulk_flows) 565 | { 566 | bool next_due, over_target, drop = false; 567 | ktime_t schedule; 568 | u64 sojourn; 569 | 570 | /* The 'schedule' variable records, in its sign, whether 'now' is before or 571 | * after 'drop_next'. This allows 'drop_next' to be updated before the next 572 | * scheduling decision is actually branched, without destroying that 573 | * information. Similarly, the first 'schedule' value calculated is preserved 574 | * in the boolean 'next_due'. 575 | * 576 | * As for 'drop_next', we take advantage of the fact that 'interval' is both 577 | * the delay between first exceeding 'target' and the first signalling event, 578 | * *and* the scaling factor for the signalling frequency. It's therefore very 579 | * natural to use a single mechanism for both purposes, and eliminates a 580 | * significant amount of reference Codel's spaghetti code. To help with this, 581 | * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 582 | * as possible to 1.0 in fixed-point. 
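 *
 * Concrete illustration of the control law: with interval = 100 ms and
 * count = 4, the next signalling event falls interval/sqrt(count) = 50 ms
 * after the previous 'drop_next'. Since rec_inv_sqrt stores 1/sqrt(count)
 * in Q0.32, cobalt_control() computes this with a single multiply via
 * reciprocal_scale().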
583 | */ 584 | 585 | sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 586 | schedule = ktime_sub(now, vars->drop_next); 587 | over_target = sojourn > p->target && 588 | sojourn > p->mtu_time * bulk_flows * 2 && 589 | sojourn > p->mtu_time * 4; 590 | next_due = vars->count && ktime_to_ns(schedule) >= 0; 591 | 592 | vars->ecn_marked = false; 593 | 594 | if (over_target) { 595 | if (!vars->dropping) { 596 | vars->dropping = true; 597 | vars->drop_next = cobalt_control(now, 598 | p->interval, 599 | vars->rec_inv_sqrt); 600 | } 601 | if (!vars->count) 602 | vars->count = 1; 603 | } else if (vars->dropping) { 604 | vars->dropping = false; 605 | } 606 | 607 | if (next_due && vars->dropping) { 608 | /* Use ECN mark if possible, otherwise drop */ 609 | drop = !(vars->ecn_marked = cake_set_ce(skb)); 610 | 611 | vars->count++; 612 | if (!vars->count) 613 | vars->count--; 614 | cobalt_invsqrt(vars); 615 | vars->drop_next = cobalt_control(vars->drop_next, 616 | p->interval, 617 | vars->rec_inv_sqrt); 618 | schedule = ktime_sub(now, vars->drop_next); 619 | } else { 620 | while (next_due) { 621 | vars->count--; 622 | cobalt_invsqrt(vars); 623 | vars->drop_next = cobalt_control(vars->drop_next, 624 | p->interval, 625 | vars->rec_inv_sqrt); 626 | schedule = ktime_sub(now, vars->drop_next); 627 | next_due = vars->count && ktime_to_ns(schedule) >= 0; 628 | } 629 | } 630 | 631 | /* Simple BLUE implementation. Lack of ECN is deliberate. */ 632 | if (vars->p_drop) 633 | drop |= (prandom_u32() < vars->p_drop); 634 | 635 | /* Overload the drop_next field as an activity timeout */ 636 | if (!vars->count) 637 | vars->drop_next = ktime_add_ns(now, p->interval); 638 | else if (ktime_to_ns(schedule) > 0 && !drop) 639 | vars->drop_next = now; 640 | 641 | return drop; 642 | } 643 | 644 | #if IS_REACHABLE(CONFIG_NF_CONNTRACK) 645 | 646 | static void cake_update_flowkeys(struct flow_keys *keys, 647 | const struct sk_buff *skb) 648 | { 649 | const struct nf_conntrack_tuple *tuple; 650 | enum ip_conntrack_info ctinfo; 651 | struct nf_conn *ct; 652 | bool rev = false; 653 | 654 | if (cake_skb_proto(skb) != htons(ETH_P_IP)) 655 | return; 656 | 657 | ct = nf_ct_get(skb, &ctinfo); 658 | if (ct) { 659 | tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 660 | } else { 661 | const struct nf_conntrack_tuple_hash *hash; 662 | struct nf_conntrack_tuple srctuple; 663 | 664 | #if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE 665 | if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 666 | NFPROTO_IPV4, &srctuple)) 667 | #else 668 | if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 669 | NFPROTO_IPV4, dev_net(skb->dev), 670 | &srctuple)) 671 | #endif 672 | return; 673 | 674 | #if KERNEL_VERSION(4, 3, 0) > LINUX_VERSION_CODE 675 | hash = nf_conntrack_find_get(dev_net(skb->dev), 676 | NF_CT_DEFAULT_ZONE, 677 | &srctuple); 678 | #else 679 | hash = nf_conntrack_find_get(dev_net(skb->dev), 680 | &nf_ct_zone_dflt, 681 | &srctuple); 682 | #endif 683 | if (!hash) 684 | return; 685 | 686 | rev = true; 687 | ct = nf_ct_tuplehash_to_ctrack(hash); 688 | tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 689 | } 690 | 691 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 692 | keys->src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; 693 | keys->dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; 694 | #else 695 | keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; 696 | keys->addrs.v4addrs.dst = rev ? 
tuple->src.u3.ip : tuple->dst.u3.ip; 697 | #endif 698 | 699 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 700 | if (keys->ports) { 701 | keys->port16[0] = rev ? tuple->dst.u.all : tuple->src.u.all; 702 | keys->port16[1] = rev ? tuple->src.u.all : tuple->dst.u.all; 703 | } 704 | #else 705 | if (keys->ports.ports) { 706 | keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; 707 | keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; 708 | } 709 | #endif 710 | if (rev) 711 | nf_ct_put(ct); 712 | } 713 | #else 714 | static void cake_update_flowkeys(struct flow_keys *keys, 715 | const struct sk_buff *skb) 716 | { 717 | /* There is nothing we can do here without CONNTRACK */ 718 | } 719 | #endif 720 | 721 | /* Cake has several subtle multiple bit settings. In these cases you 722 | * would be matching triple isolate mode as well. 723 | */ 724 | 725 | static bool cake_dsrc(int flow_mode) 726 | { 727 | return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 728 | } 729 | 730 | static bool cake_ddst(int flow_mode) 731 | { 732 | return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 733 | } 734 | 735 | static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 736 | int flow_mode, u16 flow_override, u16 host_override) 737 | { 738 | u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 739 | u16 reduced_hash, srchost_idx, dsthost_idx; 740 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 741 | struct flow_keys keys; 742 | #else 743 | struct flow_keys keys, host_keys; 744 | #endif 745 | 746 | if (unlikely(flow_mode == CAKE_FLOW_NONE)) 747 | return 0; 748 | 749 | /* If both overrides are set we can skip packet dissection entirely */ 750 | if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && 751 | (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) 752 | goto skip_hash; 753 | 754 | #if KERNEL_VERSION(4, 2, 0) > LINUX_VERSION_CODE 755 | skb_flow_dissect(skb, &keys); 756 | 757 | if (flow_mode & CAKE_FLOW_NAT_FLAG) 758 | cake_update_flowkeys(&keys, skb); 759 | 760 | srchost_hash = jhash_1word((__force u32)keys.src, q->perturb); 761 | dsthost_hash = jhash_1word((__force u32)keys.dst, q->perturb); 762 | 763 | if (flow_mode & CAKE_FLOW_FLOWS) 764 | flow_hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src ^ keys.ip_proto, (__force u32)keys.ports, q->perturb); 765 | 766 | #else 767 | 768 | /* Linux kernel 4.2.x have skb_flow_dissect_flow_keys which takes only 2 769 | * arguments 770 | */ 771 | #if (KERNEL_VERSION(4, 2, 0) <= LINUX_VERSION_CODE) && (KERNEL_VERSION(4, 3, 0) > LINUX_VERSION_CODE) 772 | skb_flow_dissect_flow_keys(skb, &keys); 773 | #else 774 | skb_flow_dissect_flow_keys(skb, &keys, 775 | FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 776 | #endif 777 | 778 | if (flow_mode & CAKE_FLOW_NAT_FLAG) 779 | cake_update_flowkeys(&keys, skb); 780 | 781 | /* flow_hash_from_keys() sorts the addresses by value, so we have 782 | * to preserve their order in a separate data structure to treat 783 | * src and dst host addresses as independently selectable. 
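 *
 * (Worked example: because flow_hash_from_keys() canonicalises the address
 *  pair by sorting it, {src=10.1.1.1, dst=10.1.1.2} and {src=10.1.1.2,
 *  dst=10.1.1.1} hash identically, which is fine for flows but useless for
 *  telling the two hosts apart.  Hence the host_keys copy below scrubs the
 *  per-flow fields and zeroes one address at a time, roughly:
 *
 *      host_keys.addrs.v4addrs.src = 0;
 *      dsthost_hash = flow_hash_from_keys(&host_keys);   // dst host only
 *      host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 *      host_keys.addrs.v4addrs.dst = 0;
 *      srchost_hash = flow_hash_from_keys(&host_keys);   // src host only
 *
 *  so each per-host hash depends on exactly one, un-swapped address.)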
784 | */ 785 | host_keys = keys; 786 | host_keys.ports.ports = 0; 787 | host_keys.basic.ip_proto = 0; 788 | host_keys.keyid.keyid = 0; 789 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 790 | host_keys.tags.vlan_id = 0; 791 | #endif 792 | host_keys.tags.flow_label = 0; 793 | 794 | switch (host_keys.control.addr_type) { 795 | case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 796 | host_keys.addrs.v4addrs.src = 0; 797 | dsthost_hash = flow_hash_from_keys(&host_keys); 798 | host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 799 | host_keys.addrs.v4addrs.dst = 0; 800 | srchost_hash = flow_hash_from_keys(&host_keys); 801 | break; 802 | 803 | case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 804 | memset(&host_keys.addrs.v6addrs.src, 0, 805 | sizeof(host_keys.addrs.v6addrs.src)); 806 | dsthost_hash = flow_hash_from_keys(&host_keys); 807 | host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 808 | memset(&host_keys.addrs.v6addrs.dst, 0, 809 | sizeof(host_keys.addrs.v6addrs.dst)); 810 | srchost_hash = flow_hash_from_keys(&host_keys); 811 | break; 812 | 813 | default: 814 | dsthost_hash = 0; 815 | srchost_hash = 0; 816 | } 817 | 818 | /* This *must* be after the above switch, since as a 819 | * side-effect it sorts the src and dst addresses. 820 | */ 821 | if (flow_mode & CAKE_FLOW_FLOWS) 822 | flow_hash = flow_hash_from_keys(&keys); 823 | #endif 824 | 825 | skip_hash: 826 | if (flow_override) 827 | flow_hash = flow_override - 1; 828 | if (host_override) { 829 | dsthost_hash = host_override - 1; 830 | srchost_hash = host_override - 1; 831 | } 832 | 833 | if (!(flow_mode & CAKE_FLOW_FLOWS)) { 834 | if (flow_mode & CAKE_FLOW_SRC_IP) 835 | flow_hash ^= srchost_hash; 836 | 837 | if (flow_mode & CAKE_FLOW_DST_IP) 838 | flow_hash ^= dsthost_hash; 839 | } 840 | 841 | reduced_hash = flow_hash % CAKE_QUEUES; 842 | 843 | /* set-associative hashing */ 844 | /* fast path if no hash collision (direct lookup succeeds) */ 845 | if (likely(q->tags[reduced_hash] == flow_hash && 846 | q->flows[reduced_hash].set)) { 847 | q->way_directs++; 848 | } else { 849 | u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 850 | u32 outer_hash = reduced_hash - inner_hash; 851 | bool allocate_src = false; 852 | bool allocate_dst = false; 853 | u32 i, k; 854 | 855 | /* check if any active queue in the set is reserved for 856 | * this flow. 857 | */ 858 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 859 | i++, k = (k + 1) % CAKE_SET_WAYS) { 860 | if (q->tags[outer_hash + k] == flow_hash) { 861 | if (i) 862 | q->way_hits++; 863 | 864 | if (!q->flows[outer_hash + k].set) { 865 | /* need to increment host refcnts */ 866 | allocate_src = cake_dsrc(flow_mode); 867 | allocate_dst = cake_ddst(flow_mode); 868 | } 869 | 870 | goto found; 871 | } 872 | } 873 | 874 | /* no queue is reserved for this flow, look for an 875 | * empty one. 876 | */ 877 | for (i = 0; i < CAKE_SET_WAYS; 878 | i++, k = (k + 1) % CAKE_SET_WAYS) { 879 | if (!q->flows[outer_hash + k].set) { 880 | q->way_misses++; 881 | allocate_src = cake_dsrc(flow_mode); 882 | allocate_dst = cake_ddst(flow_mode); 883 | goto found; 884 | } 885 | } 886 | 887 | /* With no empty queues, default to the original 888 | * queue, accept the collision, update the host tags. 
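 *
 * (Worked example of the set-associative probing above, assuming
 *  CAKE_SET_WAYS is 8: a flow_hash reducing to queue 21 gives
 *
 *      inner_hash = 21 % 8  = 5
 *      outer_hash = 21 - 5  = 16
 *
 *  so the set spans queues 16..23 and is probed in the order
 *  21, 22, 23, 16, 17, 18, 19, 20 - first looking for a matching tag,
 *  then for an unused queue, before falling back to the collision
 *  handling below.)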
889 | */ 890 | q->way_collisions++; 891 | if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 892 | q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; 893 | q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; 894 | } 895 | allocate_src = cake_dsrc(flow_mode); 896 | allocate_dst = cake_ddst(flow_mode); 897 | found: 898 | /* reserve queue for future packets in same flow */ 899 | reduced_hash = outer_hash + k; 900 | q->tags[reduced_hash] = flow_hash; 901 | 902 | if (allocate_src) { 903 | srchost_idx = srchost_hash % CAKE_QUEUES; 904 | inner_hash = srchost_idx % CAKE_SET_WAYS; 905 | outer_hash = srchost_idx - inner_hash; 906 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 907 | i++, k = (k + 1) % CAKE_SET_WAYS) { 908 | if (q->hosts[outer_hash + k].srchost_tag == 909 | srchost_hash) 910 | goto found_src; 911 | } 912 | for (i = 0; i < CAKE_SET_WAYS; 913 | i++, k = (k + 1) % CAKE_SET_WAYS) { 914 | if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 915 | break; 916 | } 917 | q->hosts[outer_hash + k].srchost_tag = srchost_hash; 918 | found_src: 919 | srchost_idx = outer_hash + k; 920 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) 921 | q->hosts[srchost_idx].srchost_bulk_flow_count++; 922 | q->flows[reduced_hash].srchost = srchost_idx; 923 | } 924 | 925 | if (allocate_dst) { 926 | dsthost_idx = dsthost_hash % CAKE_QUEUES; 927 | inner_hash = dsthost_idx % CAKE_SET_WAYS; 928 | outer_hash = dsthost_idx - inner_hash; 929 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 930 | i++, k = (k + 1) % CAKE_SET_WAYS) { 931 | if (q->hosts[outer_hash + k].dsthost_tag == 932 | dsthost_hash) 933 | goto found_dst; 934 | } 935 | for (i = 0; i < CAKE_SET_WAYS; 936 | i++, k = (k + 1) % CAKE_SET_WAYS) { 937 | if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 938 | break; 939 | } 940 | q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 941 | found_dst: 942 | dsthost_idx = outer_hash + k; 943 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) 944 | q->hosts[dsthost_idx].dsthost_bulk_flow_count++; 945 | q->flows[reduced_hash].dsthost = dsthost_idx; 946 | } 947 | } 948 | 949 | return reduced_hash; 950 | } 951 | 952 | /* helper functions : might be changed when/if skb use a standard list_head */ 953 | /* remove one skb from head of slot queue */ 954 | 955 | static struct sk_buff *dequeue_head(struct cake_flow *flow) 956 | { 957 | struct sk_buff *skb = flow->head; 958 | 959 | if (skb) { 960 | flow->head = skb->next; 961 | skb->next = NULL; 962 | } 963 | 964 | return skb; 965 | } 966 | 967 | /* add skb to flow queue (tail add) */ 968 | 969 | static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 970 | { 971 | if (!flow->head) 972 | flow->head = skb; 973 | else 974 | flow->tail->next = skb; 975 | flow->tail = skb; 976 | skb->next = NULL; 977 | } 978 | 979 | static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 980 | struct ipv6hdr *buf) 981 | { 982 | unsigned int offset = skb_network_offset(skb); 983 | struct iphdr *iph; 984 | 985 | iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 986 | 987 | if (!iph) 988 | return NULL; 989 | 990 | if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 991 | return skb_header_pointer(skb, offset + iph->ihl * 4, 992 | sizeof(struct ipv6hdr), buf); 993 | 994 | else if (iph->version == 4) 995 | return iph; 996 | 997 | else if (iph->version == 6) 998 | return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 999 | buf); 1000 | 1001 | return NULL; 1002 | } 1003 | 1004 | static struct tcphdr 
*cake_get_tcphdr(const struct sk_buff *skb, 1005 | void *buf, unsigned int bufsize) 1006 | { 1007 | unsigned int offset = skb_network_offset(skb); 1008 | const struct ipv6hdr *ipv6h; 1009 | const struct tcphdr *tcph; 1010 | const struct iphdr *iph; 1011 | struct ipv6hdr _ipv6h; 1012 | struct tcphdr _tcph; 1013 | 1014 | ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 1015 | 1016 | if (!ipv6h) 1017 | return NULL; 1018 | 1019 | if (ipv6h->version == 4) { 1020 | iph = (struct iphdr *)ipv6h; 1021 | offset += iph->ihl * 4; 1022 | 1023 | /* special-case 6in4 tunnelling, as that is a common way to get 1024 | * v6 connectivity in the home 1025 | */ 1026 | if (iph->protocol == IPPROTO_IPV6) { 1027 | ipv6h = skb_header_pointer(skb, offset, 1028 | sizeof(_ipv6h), &_ipv6h); 1029 | 1030 | if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 1031 | return NULL; 1032 | 1033 | offset += sizeof(struct ipv6hdr); 1034 | 1035 | } else if (iph->protocol != IPPROTO_TCP) { 1036 | return NULL; 1037 | } 1038 | 1039 | } else if (ipv6h->version == 6) { 1040 | if (ipv6h->nexthdr != IPPROTO_TCP) 1041 | return NULL; 1042 | 1043 | offset += sizeof(struct ipv6hdr); 1044 | } else { 1045 | return NULL; 1046 | } 1047 | 1048 | tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 1049 | if (!tcph) 1050 | return NULL; 1051 | 1052 | return skb_header_pointer(skb, offset, 1053 | min(__tcp_hdrlen(tcph), bufsize), buf); 1054 | } 1055 | 1056 | static const void *cake_get_tcpopt(const struct tcphdr *tcph, 1057 | int code, int *oplen) 1058 | { 1059 | /* inspired by tcp_parse_options in tcp_input.c */ 1060 | int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1061 | const u8 *ptr = (const u8 *)(tcph + 1); 1062 | 1063 | while (length > 0) { 1064 | int opcode = *ptr++; 1065 | int opsize; 1066 | 1067 | if (opcode == TCPOPT_EOL) 1068 | break; 1069 | if (opcode == TCPOPT_NOP) { 1070 | length--; 1071 | continue; 1072 | } 1073 | if (length < 2) 1074 | break; 1075 | opsize = *ptr++; 1076 | if (opsize < 2 || opsize > length) 1077 | break; 1078 | 1079 | if (opcode == code) { 1080 | *oplen = opsize; 1081 | return ptr; 1082 | } 1083 | 1084 | ptr += opsize - 2; 1085 | length -= opsize; 1086 | } 1087 | 1088 | return NULL; 1089 | } 1090 | 1091 | /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 1092 | * bytes than the other. In the case where both sequences ACKs bytes that the 1093 | * other doesn't, A is considered greater. DSACKs in A also makes A be 1094 | * considered greater. 
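 *
 * (Illustrative example: if tcph_a SACKs [1000,2000) and tcph_b SACKs
 *  [1000,3000), every range in A is covered by B and B covers more bytes,
 *  so the result is 1, assuming the blocks lie above A's cumulative ACK
 *  and are therefore not DSACKs.  If A additionally SACKs [5000,6000),
 *  that range is not covered by B and the result is -1, i.e. A carries
 *  information that would be lost if it were dropped.  Equal coverage -
 *  including the case where neither header carries a SACK option at all -
 *  yields 0.)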
1095 | * 1096 | * @return -1, 0 or 1 as normal compare functions 1097 | */ 1098 | static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 1099 | const struct tcphdr *tcph_b) 1100 | { 1101 | const struct tcp_sack_block_wire *sack_a, *sack_b; 1102 | u32 ack_seq_a = ntohl(tcph_a->ack_seq); 1103 | u32 bytes_a = 0, bytes_b = 0; 1104 | int oplen_a, oplen_b; 1105 | bool first = true; 1106 | 1107 | sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 1108 | sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 1109 | 1110 | /* pointers point to option contents */ 1111 | oplen_a -= TCPOLEN_SACK_BASE; 1112 | oplen_b -= TCPOLEN_SACK_BASE; 1113 | 1114 | if (sack_a && oplen_a >= sizeof(*sack_a) && 1115 | (!sack_b || oplen_b < sizeof(*sack_b))) 1116 | return -1; 1117 | else if (sack_b && oplen_b >= sizeof(*sack_b) && 1118 | (!sack_a || oplen_a < sizeof(*sack_a))) 1119 | return 1; 1120 | else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1121 | (!sack_b || oplen_b < sizeof(*sack_b))) 1122 | return 0; 1123 | 1124 | while (oplen_a >= sizeof(*sack_a)) { 1125 | const struct tcp_sack_block_wire *sack_tmp = sack_b; 1126 | u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1127 | u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1128 | int oplen_tmp = oplen_b; 1129 | bool found = false; 1130 | 1131 | /* DSACK; always considered greater to prevent dropping */ 1132 | if (before(start_a, ack_seq_a)) 1133 | return -1; 1134 | 1135 | bytes_a += end_a - start_a; 1136 | 1137 | while (oplen_tmp >= sizeof(*sack_tmp)) { 1138 | u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1139 | u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1140 | 1141 | /* first time through we count the total size */ 1142 | if (first) 1143 | bytes_b += end_b - start_b; 1144 | 1145 | if (!after(start_b, start_a) && !before(end_b, end_a)) { 1146 | found = true; 1147 | if (!first) 1148 | break; 1149 | } 1150 | oplen_tmp -= sizeof(*sack_tmp); 1151 | sack_tmp++; 1152 | } 1153 | 1154 | if (!found) 1155 | return -1; 1156 | 1157 | oplen_a -= sizeof(*sack_a); 1158 | sack_a++; 1159 | first = false; 1160 | } 1161 | 1162 | /* If we made it this far, all ranges SACKed by A are covered by B, so 1163 | * either the SACKs are equal, or B SACKs more bytes. 1164 | */ 1165 | return bytes_b > bytes_a ? 
1 : 0; 1166 | } 1167 | 1168 | static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1169 | u32 *tsval, u32 *tsecr) 1170 | { 1171 | const u8 *ptr; 1172 | int opsize; 1173 | 1174 | ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1175 | 1176 | if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1177 | *tsval = get_unaligned_be32(ptr); 1178 | *tsecr = get_unaligned_be32(ptr + 4); 1179 | } 1180 | } 1181 | 1182 | static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1183 | u32 tstamp_new, u32 tsecr_new) 1184 | { 1185 | /* inspired by tcp_parse_options in tcp_input.c */ 1186 | int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1187 | const u8 *ptr = (const u8 *)(tcph + 1); 1188 | u32 tstamp, tsecr; 1189 | 1190 | /* 3 reserved flags must be unset to avoid future breakage 1191 | * ACK must be set 1192 | * ECE/CWR are handled separately 1193 | * All other flags URG/PSH/RST/SYN/FIN must be unset 1194 | * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1195 | * 0x00C00000 = CWR/ECE (handled separately) 1196 | * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1197 | */ 1198 | if (((tcp_flag_word(tcph) & 1199 | cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1200 | return false; 1201 | 1202 | while (length > 0) { 1203 | int opcode = *ptr++; 1204 | int opsize; 1205 | 1206 | if (opcode == TCPOPT_EOL) 1207 | break; 1208 | if (opcode == TCPOPT_NOP) { 1209 | length--; 1210 | continue; 1211 | } 1212 | if (length < 2) 1213 | break; 1214 | opsize = *ptr++; 1215 | if (opsize < 2 || opsize > length) 1216 | break; 1217 | 1218 | switch (opcode) { 1219 | case TCPOPT_MD5SIG: /* doesn't influence state */ 1220 | break; 1221 | 1222 | case TCPOPT_SACK: /* stricter checking performed later */ 1223 | if (opsize % 8 != 2) 1224 | return false; 1225 | break; 1226 | 1227 | case TCPOPT_TIMESTAMP: 1228 | /* only drop timestamps lower than new */ 1229 | if (opsize != TCPOLEN_TIMESTAMP) 1230 | return false; 1231 | tstamp = get_unaligned_be32(ptr); 1232 | tsecr = get_unaligned_be32(ptr + 4); 1233 | if (after(tstamp, tstamp_new) || 1234 | after(tsecr, tsecr_new)) 1235 | return false; 1236 | break; 1237 | 1238 | case TCPOPT_MSS: /* these should only be set on SYN */ 1239 | case TCPOPT_WINDOW: 1240 | case TCPOPT_SACK_PERM: 1241 | case TCPOPT_FASTOPEN: 1242 | case TCPOPT_EXP: 1243 | default: /* don't drop if any unknown options are present */ 1244 | return false; 1245 | } 1246 | 1247 | ptr += opsize - 2; 1248 | length -= opsize; 1249 | } 1250 | 1251 | return true; 1252 | } 1253 | 1254 | static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1255 | struct cake_flow *flow) 1256 | { 1257 | bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; 1258 | struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1259 | struct sk_buff *skb_check, *skb_prev = NULL; 1260 | const struct ipv6hdr *ipv6h, *ipv6h_check; 1261 | unsigned char _tcph[64], _tcph_check[64]; 1262 | const struct tcphdr *tcph, *tcph_check; 1263 | const struct iphdr *iph, *iph_check; 1264 | struct ipv6hdr _iph, _iph_check; 1265 | const struct sk_buff *skb; 1266 | int seglen, num_found = 0; 1267 | u32 tstamp = 0, tsecr = 0; 1268 | __be32 elig_flags = 0; 1269 | int sack_comp; 1270 | 1271 | /* no other possible ACKs to filter */ 1272 | if (flow->head == flow->tail) 1273 | return NULL; 1274 | 1275 | skb = flow->tail; 1276 | tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1277 | iph = cake_get_iphdr(skb, &_iph); 1278 | if (!tcph) 1279 | return NULL; 1280 | 1281 | cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1282 | 1283 | /* the 'triggering' packet need only have 
the ACK flag set. 1284 | * also check that SYN is not set, as there won't be any previous ACKs. 1285 | */ 1286 | if ((tcp_flag_word(tcph) & 1287 | (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1288 | return NULL; 1289 | 1290 | /* the 'triggering' ACK is at the tail of the queue, we have already 1291 | * returned if it is the only packet in the flow. loop through the rest 1292 | * of the queue looking for pure ACKs with the same 5-tuple as the 1293 | * triggering one. 1294 | */ 1295 | for (skb_check = flow->head; 1296 | skb_check && skb_check != skb; 1297 | skb_prev = skb_check, skb_check = skb_check->next) { 1298 | iph_check = cake_get_iphdr(skb_check, &_iph_check); 1299 | tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1300 | sizeof(_tcph_check)); 1301 | 1302 | /* only TCP packets with matching 5-tuple are eligible, and only 1303 | * drop safe headers 1304 | */ 1305 | if (!tcph_check || iph->version != iph_check->version || 1306 | tcph_check->source != tcph->source || 1307 | tcph_check->dest != tcph->dest) 1308 | continue; 1309 | 1310 | if (iph_check->version == 4) { 1311 | if (iph_check->saddr != iph->saddr || 1312 | iph_check->daddr != iph->daddr) 1313 | continue; 1314 | 1315 | seglen = ntohs(iph_check->tot_len) - 1316 | (4 * iph_check->ihl); 1317 | } else if (iph_check->version == 6) { 1318 | ipv6h = (struct ipv6hdr *)iph; 1319 | ipv6h_check = (struct ipv6hdr *)iph_check; 1320 | 1321 | if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1322 | ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1323 | continue; 1324 | 1325 | seglen = ntohs(ipv6h_check->payload_len); 1326 | } else { 1327 | WARN_ON(1); /* shouldn't happen */ 1328 | continue; 1329 | } 1330 | 1331 | /* If the ECE/CWR flags changed from the previous eligible 1332 | * packet in the same flow, we should no longer be dropping that 1333 | * previous packet as this would lose information. 1334 | */ 1335 | if (elig_ack && (tcp_flag_word(tcph_check) & 1336 | (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1337 | elig_ack = NULL; 1338 | elig_ack_prev = NULL; 1339 | num_found--; 1340 | } 1341 | 1342 | /* Check TCP options and flags, don't drop ACKs with segment 1343 | * data, and don't drop ACKs with a higher cumulative ACK 1344 | * counter than the triggering packet. Check ACK seqno here to 1345 | * avoid parsing SACK options of packets we are going to exclude 1346 | * anyway. 1347 | */ 1348 | if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1349 | (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1350 | after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1351 | continue; 1352 | 1353 | /* Check SACK options. The triggering packet must SACK more data 1354 | * than the ACK under consideration, or SACK the same range but 1355 | * have a larger cumulative ACK counter. The latter is a 1356 | * pathological case, but is contained in the following check 1357 | * anyway, just to be safe. 1358 | */ 1359 | sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1360 | 1361 | if (sack_comp < 0 || 1362 | (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1363 | sack_comp == 0)) 1364 | continue; 1365 | 1366 | /* At this point we have found an eligible pure ACK to drop; if 1367 | * we are in aggressive mode, we are done. Otherwise, keep 1368 | * searching unless this is the second eligible ACK we 1369 | * found. 1370 | * 1371 | * Since we want to drop ACK closest to the head of the queue, 1372 | * save the first eligible ACK we find, even if we need to loop 1373 | * again. 
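 *
 * (Rationale sketch: eligibility already requires that a candidate's
 *  cumulative ACK is not above the triggering packet's, so e.g. queued pure
 *  ACKs for byte 1000 and byte 2000 followed by a trigger ACKing byte 3000
 *  are redundant; dropping the one closest to the head (the ACK for 1000)
 *  discards the stalest information while the newer ACKs still deliver
 *  everything the receiver has signalled.)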
1374 | */ 1375 | if (!elig_ack) { 1376 | elig_ack = skb_check; 1377 | elig_ack_prev = skb_prev; 1378 | elig_flags = (tcp_flag_word(tcph_check) 1379 | & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1380 | } 1381 | 1382 | if (num_found++ > 0) 1383 | goto found; 1384 | } 1385 | 1386 | /* We made it through the queue without finding two eligible ACKs . If 1387 | * we found a single eligible ACK we can drop it in aggressive mode if 1388 | * we can guarantee that this does not interfere with ECN flag 1389 | * information. We ensure this by dropping it only if the enqueued 1390 | * packet is consecutive with the eligible ACK, and their flags match. 1391 | */ 1392 | if (elig_ack && aggressive && elig_ack->next == skb && 1393 | (elig_flags == (tcp_flag_word(tcph) & 1394 | (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1395 | goto found; 1396 | 1397 | return NULL; 1398 | 1399 | found: 1400 | if (elig_ack_prev) 1401 | elig_ack_prev->next = elig_ack->next; 1402 | else 1403 | flow->head = elig_ack->next; 1404 | 1405 | elig_ack->next = NULL; 1406 | 1407 | return elig_ack; 1408 | } 1409 | 1410 | static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1411 | { 1412 | avg -= avg >> shift; 1413 | avg += sample >> shift; 1414 | return avg; 1415 | } 1416 | 1417 | static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) 1418 | { 1419 | if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1420 | len -= off; 1421 | 1422 | if (q->max_netlen < len) 1423 | q->max_netlen = len; 1424 | if (q->min_netlen > len) 1425 | q->min_netlen = len; 1426 | 1427 | len += q->rate_overhead; 1428 | 1429 | if (len < q->rate_mpu) 1430 | len = q->rate_mpu; 1431 | 1432 | if (q->atm_mode == CAKE_ATM_ATM) { 1433 | len += 47; 1434 | len /= 48; 1435 | len *= 53; 1436 | } else if (q->atm_mode == CAKE_ATM_PTM) { 1437 | /* Add one byte per 64 bytes or part thereof. 1438 | * This is conservative and easier to calculate than the 1439 | * precise value. 
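 *
 * (Worked numbers: a 1500 byte packet gets (1500 + 63) / 64 = 24 extra bytes
 *  here, i.e. 1524 bytes on a 64b/65b encoded (PTM) link, while the ATM
 *  branch above rounds the same packet up to (1500 + 47) / 48 = 32 cells of
 *  53 bytes, i.e. 1696 bytes on the wire.)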
1440 | */ 1441 | len += (len + 63) / 64; 1442 | } 1443 | 1444 | if (q->max_adjlen < len) 1445 | q->max_adjlen = len; 1446 | if (q->min_adjlen > len) 1447 | q->min_adjlen = len; 1448 | 1449 | return len; 1450 | } 1451 | 1452 | static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1453 | { 1454 | const struct skb_shared_info *shinfo = skb_shinfo(skb); 1455 | unsigned int hdr_len, last_len = 0; 1456 | u32 off = skb_network_offset(skb); 1457 | u32 len = qdisc_pkt_len(skb); 1458 | u16 segs = 1; 1459 | 1460 | q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); 1461 | 1462 | if (!shinfo->gso_size) 1463 | return cake_calc_overhead(q, len, off); 1464 | 1465 | /* borrowed from qdisc_pkt_len_init() */ 1466 | hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 1467 | 1468 | /* + transport layer */ 1469 | if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1470 | SKB_GSO_TCPV6))) { 1471 | const struct tcphdr *th; 1472 | struct tcphdr _tcphdr; 1473 | 1474 | th = skb_header_pointer(skb, skb_transport_offset(skb), 1475 | sizeof(_tcphdr), &_tcphdr); 1476 | if (likely(th)) 1477 | hdr_len += __tcp_hdrlen(th); 1478 | } else { 1479 | struct udphdr _udphdr; 1480 | 1481 | if (skb_header_pointer(skb, skb_transport_offset(skb), 1482 | sizeof(_udphdr), &_udphdr)) 1483 | hdr_len += sizeof(struct udphdr); 1484 | } 1485 | 1486 | if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) 1487 | segs = DIV_ROUND_UP(skb->len - hdr_len, 1488 | shinfo->gso_size); 1489 | else 1490 | segs = shinfo->gso_segs; 1491 | 1492 | len = shinfo->gso_size + hdr_len; 1493 | last_len = skb->len - shinfo->gso_size * (segs - 1); 1494 | 1495 | return (cake_calc_overhead(q, len, off) * (segs - 1) + 1496 | cake_calc_overhead(q, last_len, off)); 1497 | } 1498 | 1499 | static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1500 | { 1501 | struct cake_heap_entry ii = q->overflow_heap[i]; 1502 | struct cake_heap_entry jj = q->overflow_heap[j]; 1503 | 1504 | q->overflow_heap[i] = jj; 1505 | q->overflow_heap[j] = ii; 1506 | 1507 | q->tins[ii.t].overflow_idx[ii.b] = j; 1508 | q->tins[jj.t].overflow_idx[jj.b] = i; 1509 | } 1510 | 1511 | static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1512 | { 1513 | struct cake_heap_entry ii = q->overflow_heap[i]; 1514 | 1515 | return q->tins[ii.t].backlogs[ii.b]; 1516 | } 1517 | 1518 | static void cake_heapify(struct cake_sched_data *q, u16 i) 1519 | { 1520 | static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1521 | u32 mb = cake_heap_get_backlog(q, i); 1522 | u32 m = i; 1523 | 1524 | while (m < a) { 1525 | u32 l = m + m + 1; 1526 | u32 r = l + 1; 1527 | 1528 | if (l < a) { 1529 | u32 lb = cake_heap_get_backlog(q, l); 1530 | 1531 | if (lb > mb) { 1532 | m = l; 1533 | mb = lb; 1534 | } 1535 | } 1536 | 1537 | if (r < a) { 1538 | u32 rb = cake_heap_get_backlog(q, r); 1539 | 1540 | if (rb > mb) { 1541 | m = r; 1542 | mb = rb; 1543 | } 1544 | } 1545 | 1546 | if (m != i) { 1547 | cake_heap_swap(q, i, m); 1548 | i = m; 1549 | } else { 1550 | break; 1551 | } 1552 | } 1553 | } 1554 | 1555 | static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1556 | { 1557 | while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1558 | u16 p = (i - 1) >> 1; 1559 | u32 ib = cake_heap_get_backlog(q, i); 1560 | u32 pb = cake_heap_get_backlog(q, p); 1561 | 1562 | if (ib > pb) { 1563 | cake_heap_swap(q, i, p); 1564 | i = p; 1565 | } else { 1566 | break; 1567 | } 1568 | } 1569 | } 1570 | 1571 | static int cake_advance_shaper(struct cake_sched_data *q, 1572 | struct cake_tin_data *b, 
1573 | struct sk_buff *skb, 1574 | ktime_t now, bool drop) 1575 | { 1576 | u32 len = get_cobalt_cb(skb)->adjusted_len; 1577 | 1578 | /* charge packet bandwidth to this tin 1579 | * and to the global shaper. 1580 | */ 1581 | if (q->rate_ns) { 1582 | u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1583 | u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1584 | u64 failsafe_dur = global_dur + (global_dur >> 1); 1585 | 1586 | if (ktime_before(b->time_next_packet, now)) 1587 | b->time_next_packet = ktime_add_ns(b->time_next_packet, 1588 | tin_dur); 1589 | 1590 | else if (ktime_before(b->time_next_packet, 1591 | ktime_add_ns(now, tin_dur))) 1592 | b->time_next_packet = ktime_add_ns(now, tin_dur); 1593 | 1594 | q->time_next_packet = ktime_add_ns(q->time_next_packet, 1595 | global_dur); 1596 | if (!drop) 1597 | q->failsafe_next_packet = \ 1598 | ktime_add_ns(q->failsafe_next_packet, 1599 | failsafe_dur); 1600 | } 1601 | return len; 1602 | } 1603 | 1604 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1605 | static unsigned int cake_drop(struct Qdisc *sch) 1606 | #else 1607 | static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1608 | #endif 1609 | { 1610 | struct cake_sched_data *q = qdisc_priv(sch); 1611 | ktime_t now = ktime_get(); 1612 | u32 idx = 0, tin = 0, len; 1613 | struct cake_heap_entry qq; 1614 | struct cake_tin_data *b; 1615 | struct cake_flow *flow; 1616 | struct sk_buff *skb; 1617 | 1618 | if (!q->overflow_timeout) { 1619 | int i; 1620 | /* Build fresh max-heap */ 1621 | for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) 1622 | cake_heapify(q, i); 1623 | } 1624 | q->overflow_timeout = 65535; 1625 | 1626 | /* select longest queue for pruning */ 1627 | qq = q->overflow_heap[0]; 1628 | tin = qq.t; 1629 | idx = qq.b; 1630 | 1631 | b = &q->tins[tin]; 1632 | flow = &b->flows[idx]; 1633 | skb = dequeue_head(flow); 1634 | if (unlikely(!skb)) { 1635 | /* heap has gone wrong, rebuild it next time */ 1636 | q->overflow_timeout = 0; 1637 | return idx + (tin << 16); 1638 | } 1639 | 1640 | if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1641 | b->unresponsive_flow_count++; 1642 | 1643 | len = qdisc_pkt_len(skb); 1644 | q->buffer_used -= skb->truesize; 1645 | b->backlogs[idx] -= len; 1646 | b->tin_backlog -= len; 1647 | sch->qstats.backlog -= len; 1648 | qdisc_tree_reduce_backlog(sch, 1, len); 1649 | 1650 | flow->dropped++; 1651 | b->tin_dropped++; 1652 | sch->qstats.drops++; 1653 | 1654 | if (q->rate_flags & CAKE_FLAG_INGRESS) 1655 | cake_advance_shaper(q, b, skb, now, true); 1656 | 1657 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1658 | kfree_skb(skb); 1659 | #else 1660 | __qdisc_drop(skb, to_free); 1661 | #endif 1662 | sch->q.qlen--; 1663 | 1664 | cake_heapify(q, 0); 1665 | 1666 | return idx + (tin << 16); 1667 | } 1668 | 1669 | static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) 1670 | { 1671 | const int offset = skb_network_offset(skb); 1672 | u16 *buf, buf_; 1673 | u8 dscp; 1674 | 1675 | switch (cake_skb_proto(skb)) { 1676 | case htons(ETH_P_IP): 1677 | buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1678 | if (unlikely(!buf)) 1679 | return 0; 1680 | 1681 | /* ToS is in the second byte of iphdr */ 1682 | dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; 1683 | 1684 | if (wash && dscp) { 1685 | const int wlen = offset + sizeof(struct iphdr); 1686 | 1687 | if (!pskb_may_pull(skb, wlen) || 1688 | skb_try_make_writable(skb, wlen)) 1689 | return 0; 1690 | 1691 | ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1692 | 
} 1693 | 1694 | return dscp; 1695 | 1696 | case htons(ETH_P_IPV6): 1697 | buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1698 | if (unlikely(!buf)) 1699 | return 0; 1700 | 1701 | /* Traffic class is in the first and second bytes of ipv6hdr */ 1702 | dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; 1703 | 1704 | if (wash && dscp) { 1705 | const int wlen = offset + sizeof(struct ipv6hdr); 1706 | 1707 | if (!pskb_may_pull(skb, wlen) || 1708 | skb_try_make_writable(skb, wlen)) 1709 | return 0; 1710 | 1711 | ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1712 | } 1713 | 1714 | return dscp; 1715 | 1716 | case htons(ETH_P_ARP): 1717 | return 0x38; /* CS7 - Net Control */ 1718 | 1719 | default: 1720 | /* If there is no Diffserv field, treat as best-effort */ 1721 | return 0; 1722 | } 1723 | } 1724 | 1725 | static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1726 | struct sk_buff *skb) 1727 | { 1728 | struct cake_sched_data *q = qdisc_priv(sch); 1729 | u32 tin, mark; 1730 | bool wash; 1731 | u8 dscp; 1732 | 1733 | /* Tin selection: Default to diffserv-based selection, allow overriding 1734 | * using firewall marks or skb->priority. Call DSCP parsing early if 1735 | * wash is enabled, otherwise defer to below to skip unneeded parsing. 1736 | */ 1737 | mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1738 | wash = !!(q->rate_flags & CAKE_FLAG_WASH); 1739 | if (wash) 1740 | dscp = cake_handle_diffserv(skb, wash); 1741 | 1742 | if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1743 | tin = 0; 1744 | 1745 | else if (mark && mark <= q->tin_cnt) 1746 | tin = q->tin_order[mark - 1]; 1747 | 1748 | else if (TC_H_MAJ(skb->priority) == sch->handle && 1749 | TC_H_MIN(skb->priority) > 0 && 1750 | TC_H_MIN(skb->priority) <= q->tin_cnt) 1751 | tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; 1752 | 1753 | else { 1754 | if (!wash) 1755 | dscp = cake_handle_diffserv(skb, wash); 1756 | tin = q->tin_index[dscp]; 1757 | 1758 | if (unlikely(tin >= q->tin_cnt)) 1759 | tin = 0; 1760 | } 1761 | 1762 | return &q->tins[tin]; 1763 | } 1764 | 1765 | static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1766 | struct sk_buff *skb, int flow_mode, int *qerr) 1767 | { 1768 | struct cake_sched_data *q = qdisc_priv(sch); 1769 | struct tcf_proto *filter; 1770 | struct tcf_result res; 1771 | u16 flow = 0, host = 0; 1772 | int result; 1773 | 1774 | filter = rcu_dereference_bh(q->filter_list); 1775 | if (!filter) 1776 | goto hash; 1777 | 1778 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1779 | result = tcf_classify(skb, filter, &res, false); 1780 | 1781 | if (result >= 0) { 1782 | #ifdef CONFIG_NET_CLS_ACT 1783 | switch (result) { 1784 | case TC_ACT_STOLEN: 1785 | case TC_ACT_QUEUED: 1786 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) 1787 | case TC_ACT_TRAP: 1788 | #endif 1789 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1790 | /* fall through */ 1791 | case TC_ACT_SHOT: 1792 | return 0; 1793 | } 1794 | #endif 1795 | if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1796 | flow = TC_H_MIN(res.classid); 1797 | if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1798 | host = TC_H_MAJ(res.classid) >> 16; 1799 | } 1800 | hash: 1801 | *t = cake_select_tin(sch, skb); 1802 | return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1803 | } 1804 | 1805 | static void cake_reconfigure(struct Qdisc *sch); 1806 | 1807 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1808 | static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch) 1809 | #else 1810 | static s32 cake_enqueue(struct sk_buff 
*skb, struct Qdisc *sch, 1811 | struct sk_buff **to_free) 1812 | #endif 1813 | { 1814 | struct cake_sched_data *q = qdisc_priv(sch); 1815 | int len = qdisc_pkt_len(skb); 1816 | int uninitialized_var(ret); 1817 | struct sk_buff *ack = NULL; 1818 | ktime_t now = ktime_get(); 1819 | struct cake_tin_data *b; 1820 | struct cake_flow *flow; 1821 | u32 idx; 1822 | 1823 | /* choose flow to insert into */ 1824 | idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); 1825 | if (idx == 0) { 1826 | if (ret & __NET_XMIT_BYPASS) 1827 | qdisc_qstats_drop(sch); 1828 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1829 | kfree_skb(skb); 1830 | #else 1831 | __qdisc_drop(skb, to_free); 1832 | #endif 1833 | return ret; 1834 | } 1835 | idx--; 1836 | flow = &b->flows[idx]; 1837 | 1838 | /* ensure shaper state isn't stale */ 1839 | if (!b->tin_backlog) { 1840 | if (ktime_before(b->time_next_packet, now)) 1841 | b->time_next_packet = now; 1842 | 1843 | if (!sch->q.qlen) { 1844 | if (ktime_before(q->time_next_packet, now)) { 1845 | q->failsafe_next_packet = now; 1846 | q->time_next_packet = now; 1847 | } else if (ktime_after(q->time_next_packet, now) && 1848 | ktime_after(q->failsafe_next_packet, now)) { 1849 | u64 next = \ 1850 | min(ktime_to_ns(q->time_next_packet), 1851 | ktime_to_ns( 1852 | q->failsafe_next_packet)); 1853 | sch->qstats.overlimits++; 1854 | qdisc_watchdog_schedule_ns(&q->watchdog, next); 1855 | } 1856 | } 1857 | } 1858 | 1859 | if (unlikely(len > b->max_skblen)) 1860 | b->max_skblen = len; 1861 | 1862 | if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1863 | struct sk_buff *segs, *nskb; 1864 | netdev_features_t features = netif_skb_features(skb); 1865 | unsigned int slen = 0, numsegs = 0; 1866 | 1867 | segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1868 | if (IS_ERR_OR_NULL(segs)) 1869 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 1870 | return qdisc_reshape_fail(skb, sch); 1871 | #else 1872 | return qdisc_drop(skb, sch, to_free); 1873 | #endif 1874 | 1875 | while (segs) { 1876 | nskb = segs->next; 1877 | segs->next = NULL; 1878 | qdisc_skb_cb(segs)->pkt_len = segs->len; 1879 | cobalt_set_enqueue_time(segs, now); 1880 | get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1881 | segs); 1882 | flow_queue_add(flow, segs); 1883 | 1884 | sch->q.qlen++; 1885 | numsegs++; 1886 | slen += segs->len; 1887 | q->buffer_used += segs->truesize; 1888 | b->packets++; 1889 | segs = nskb; 1890 | } 1891 | 1892 | /* stats */ 1893 | b->bytes += slen; 1894 | b->backlogs[idx] += slen; 1895 | b->tin_backlog += slen; 1896 | sch->qstats.backlog += slen; 1897 | q->avg_window_bytes += slen; 1898 | 1899 | qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1900 | consume_skb(skb); 1901 | } else { 1902 | /* not splitting */ 1903 | cobalt_set_enqueue_time(skb, now); 1904 | get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1905 | flow_queue_add(flow, skb); 1906 | 1907 | if (q->ack_filter) 1908 | ack = cake_ack_filter(q, flow); 1909 | 1910 | if (ack) { 1911 | b->ack_drops++; 1912 | sch->qstats.drops++; 1913 | b->bytes += qdisc_pkt_len(ack); 1914 | len -= qdisc_pkt_len(ack); 1915 | q->buffer_used += skb->truesize - ack->truesize; 1916 | if (q->rate_flags & CAKE_FLAG_INGRESS) 1917 | cake_advance_shaper(q, b, ack, now, true); 1918 | 1919 | qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); 1920 | consume_skb(ack); 1921 | } else { 1922 | sch->q.qlen++; 1923 | q->buffer_used += skb->truesize; 1924 | } 1925 | 1926 | /* stats */ 1927 | b->packets++; 1928 | b->bytes += len; 1929 | 
b->backlogs[idx] += len; 1930 | b->tin_backlog += len; 1931 | sch->qstats.backlog += len; 1932 | q->avg_window_bytes += len; 1933 | } 1934 | 1935 | if (q->overflow_timeout) 1936 | cake_heapify_up(q, b->overflow_idx[idx]); 1937 | 1938 | /* incoming bandwidth capacity estimate */ 1939 | if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1940 | u64 packet_interval = \ 1941 | ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1942 | 1943 | if (packet_interval > NSEC_PER_SEC) 1944 | packet_interval = NSEC_PER_SEC; 1945 | 1946 | /* filter out short-term bursts, eg. wifi aggregation */ 1947 | q->avg_packet_interval = \ 1948 | cake_ewma(q->avg_packet_interval, 1949 | packet_interval, 1950 | (packet_interval > q->avg_packet_interval ? 1951 | 2 : 8)); 1952 | 1953 | q->last_packet_time = now; 1954 | 1955 | if (packet_interval > q->avg_packet_interval) { 1956 | u64 window_interval = \ 1957 | ktime_to_ns(ktime_sub(now, 1958 | q->avg_window_begin)); 1959 | u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1960 | 1961 | b = div64_u64(b, window_interval); 1962 | q->avg_peak_bandwidth = 1963 | cake_ewma(q->avg_peak_bandwidth, b, 1964 | b > q->avg_peak_bandwidth ? 2 : 8); 1965 | q->avg_window_bytes = 0; 1966 | q->avg_window_begin = now; 1967 | 1968 | if (ktime_after(now, 1969 | ktime_add_ms(q->last_reconfig_time, 1970 | 250))) { 1971 | q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1972 | cake_reconfigure(sch); 1973 | } 1974 | } 1975 | } else { 1976 | q->avg_window_bytes = 0; 1977 | q->last_packet_time = now; 1978 | } 1979 | 1980 | /* flowchain */ 1981 | if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1982 | struct cake_host *srchost = &b->hosts[flow->srchost]; 1983 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; 1984 | u16 host_load = 1; 1985 | 1986 | if (!flow->set) { 1987 | list_add_tail(&flow->flowchain, &b->new_flows); 1988 | } else { 1989 | b->decaying_flow_count--; 1990 | list_move_tail(&flow->flowchain, &b->new_flows); 1991 | } 1992 | flow->set = CAKE_SET_SPARSE; 1993 | b->sparse_flow_count++; 1994 | 1995 | if (cake_dsrc(q->flow_mode)) 1996 | host_load = max(host_load, srchost->srchost_bulk_flow_count); 1997 | 1998 | if (cake_ddst(q->flow_mode)) 1999 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2000 | 2001 | flow->deficit = (b->flow_quantum * 2002 | quantum_div[host_load]) >> 16; 2003 | } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 2004 | struct cake_host *srchost = &b->hosts[flow->srchost]; 2005 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; 2006 | 2007 | /* this flow was empty, accounted as a sparse flow, but actually 2008 | * in the bulk rotation. 
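 *
 * (Rough lifecycle, as implemented here and in cake_dequeue() below: a flow
 *  normally goes NONE -> SPARSE on its first packet; if cake_dequeue() finds
 *  it empty with its deficit spent it is parked as SPARSE_WAIT in the bulk
 *  rotation, and the next packet to arrive - this branch - promotes it to
 *  BULK and charges the per-host bulk-flow counters used for host fairness.)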
2009 | */ 2010 | flow->set = CAKE_SET_BULK; 2011 | b->sparse_flow_count--; 2012 | b->bulk_flow_count++; 2013 | 2014 | if (cake_dsrc(q->flow_mode)) 2015 | srchost->srchost_bulk_flow_count++; 2016 | 2017 | if (cake_ddst(q->flow_mode)) 2018 | dsthost->dsthost_bulk_flow_count++; 2019 | 2020 | } 2021 | 2022 | if (q->buffer_used > q->buffer_max_used) 2023 | q->buffer_max_used = q->buffer_used; 2024 | 2025 | if (q->buffer_used > q->buffer_limit) { 2026 | u32 dropped = 0; 2027 | 2028 | while (q->buffer_used > q->buffer_limit) { 2029 | dropped++; 2030 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 2031 | cake_drop(sch); 2032 | #else 2033 | cake_drop(sch, to_free); 2034 | #endif 2035 | } 2036 | b->drop_overlimit += dropped; 2037 | } 2038 | return NET_XMIT_SUCCESS; 2039 | } 2040 | 2041 | static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 2042 | { 2043 | struct cake_sched_data *q = qdisc_priv(sch); 2044 | struct cake_tin_data *b = &q->tins[q->cur_tin]; 2045 | struct cake_flow *flow = &b->flows[q->cur_flow]; 2046 | struct sk_buff *skb = NULL; 2047 | u32 len; 2048 | 2049 | if (flow->head) { 2050 | skb = dequeue_head(flow); 2051 | len = qdisc_pkt_len(skb); 2052 | b->backlogs[q->cur_flow] -= len; 2053 | b->tin_backlog -= len; 2054 | sch->qstats.backlog -= len; 2055 | q->buffer_used -= skb->truesize; 2056 | sch->q.qlen--; 2057 | 2058 | if (q->overflow_timeout) 2059 | cake_heapify(q, b->overflow_idx[q->cur_flow]); 2060 | } 2061 | return skb; 2062 | } 2063 | 2064 | /* Discard leftover packets from a tin no longer in use. */ 2065 | static void cake_clear_tin(struct Qdisc *sch, u16 tin) 2066 | { 2067 | struct cake_sched_data *q = qdisc_priv(sch); 2068 | struct sk_buff *skb; 2069 | 2070 | q->cur_tin = tin; 2071 | for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 2072 | while (!!(skb = cake_dequeue_one(sch))) 2073 | kfree_skb(skb); 2074 | } 2075 | 2076 | static struct sk_buff *cake_dequeue(struct Qdisc *sch) 2077 | { 2078 | struct cake_sched_data *q = qdisc_priv(sch); 2079 | struct cake_tin_data *b = &q->tins[q->cur_tin]; 2080 | struct cake_host *srchost, *dsthost; 2081 | ktime_t now = ktime_get(); 2082 | struct cake_flow *flow; 2083 | struct list_head *head; 2084 | bool first_flow = true; 2085 | struct sk_buff *skb; 2086 | u16 host_load; 2087 | u64 delay; 2088 | u32 len; 2089 | 2090 | begin: 2091 | if (!sch->q.qlen) 2092 | return NULL; 2093 | 2094 | /* global hard shaper */ 2095 | if (ktime_after(q->time_next_packet, now) && 2096 | ktime_after(q->failsafe_next_packet, now)) { 2097 | u64 next = min(ktime_to_ns(q->time_next_packet), 2098 | ktime_to_ns(q->failsafe_next_packet)); 2099 | 2100 | sch->qstats.overlimits++; 2101 | qdisc_watchdog_schedule_ns(&q->watchdog, next); 2102 | return NULL; 2103 | } 2104 | 2105 | /* Choose a class to work on. */ 2106 | if (!q->rate_ns) { 2107 | /* In unlimited mode, can't rely on shaper timings, just balance 2108 | * with DRR 2109 | */ 2110 | bool wrapped = false, empty = true; 2111 | 2112 | while (b->tin_deficit < 0 || 2113 | !(b->sparse_flow_count + b->bulk_flow_count)) { 2114 | if (b->tin_deficit <= 0) 2115 | b->tin_deficit += b->tin_quantum; 2116 | if (b->sparse_flow_count + b->bulk_flow_count) 2117 | empty = false; 2118 | 2119 | q->cur_tin++; 2120 | b++; 2121 | if (q->cur_tin >= q->tin_cnt) { 2122 | q->cur_tin = 0; 2123 | b = q->tins; 2124 | 2125 | if (wrapped) { 2126 | /* It's possible for q->qlen to be 2127 | * nonzero when we actually have no 2128 | * packets anywhere. 
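 *
 * (Illustrative weights for this DRR pass: each time a tin's deficit is
 *  refilled it gains tin_quantum bytes of credit, so in unlimited mode the
 *  tins share the link roughly in proportion to their quanta - e.g. the
 *  diffserv3 configuration further down programs quanta of 1024, 64 and 256,
 *  giving long-term shares of about 16:1:4 when all three tins are
 *  backlogged.)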
2129 | */ 2130 | if (empty) 2131 | return NULL; 2132 | } else { 2133 | wrapped = true; 2134 | } 2135 | } 2136 | } 2137 | } else { 2138 | /* In shaped mode, choose: 2139 | * - Highest-priority tin with queue and meeting schedule, or 2140 | * - The earliest-scheduled tin with queue. 2141 | */ 2142 | ktime_t best_time = ns_to_ktime(KTIME_MAX); 2143 | int tin, best_tin = 0; 2144 | 2145 | for (tin = 0; tin < q->tin_cnt; tin++) { 2146 | b = q->tins + tin; 2147 | if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 2148 | ktime_t time_to_pkt = \ 2149 | ktime_sub(b->time_next_packet, now); 2150 | 2151 | if (ktime_to_ns(time_to_pkt) <= 0 || 2152 | ktime_compare(time_to_pkt, 2153 | best_time) <= 0) { 2154 | best_time = time_to_pkt; 2155 | best_tin = tin; 2156 | } 2157 | } 2158 | } 2159 | 2160 | q->cur_tin = best_tin; 2161 | b = q->tins + best_tin; 2162 | 2163 | /* No point in going further if no packets to deliver. */ 2164 | if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2165 | return NULL; 2166 | } 2167 | 2168 | retry: 2169 | /* service this class */ 2170 | head = &b->decaying_flows; 2171 | if (!first_flow || list_empty(head)) { 2172 | head = &b->new_flows; 2173 | if (list_empty(head)) { 2174 | head = &b->old_flows; 2175 | if (unlikely(list_empty(head))) { 2176 | head = &b->decaying_flows; 2177 | if (unlikely(list_empty(head))) 2178 | goto begin; 2179 | } 2180 | } 2181 | } 2182 | flow = list_first_entry(head, struct cake_flow, flowchain); 2183 | q->cur_flow = flow - b->flows; 2184 | first_flow = false; 2185 | 2186 | /* triple isolation (modified DRR++) */ 2187 | srchost = &b->hosts[flow->srchost]; 2188 | dsthost = &b->hosts[flow->dsthost]; 2189 | host_load = 1; 2190 | 2191 | /* flow isolation (DRR++) */ 2192 | if (flow->deficit <= 0) { 2193 | /* Keep all flows with deficits out of the sparse and decaying 2194 | * rotations. No non-empty flow can go into the decaying 2195 | * rotation, so they can't get deficits 2196 | */ 2197 | if (flow->set == CAKE_SET_SPARSE) { 2198 | if (flow->head) { 2199 | b->sparse_flow_count--; 2200 | b->bulk_flow_count++; 2201 | 2202 | if (cake_dsrc(q->flow_mode)) 2203 | srchost->srchost_bulk_flow_count++; 2204 | 2205 | if (cake_ddst(q->flow_mode)) 2206 | dsthost->dsthost_bulk_flow_count++; 2207 | 2208 | flow->set = CAKE_SET_BULK; 2209 | } else { 2210 | /* we've moved it to the bulk rotation for 2211 | * correct deficit accounting but we still want 2212 | * to count it as a sparse flow, not a bulk one. 
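 *
 * (Deficit arithmetic used just below, as a sketch: quantum_div[host_load]
 *  approximates 65535 / host_load, so with the default flow_quantum of
 *  1514 bytes a host carrying two bulk flows refills each of them by about
 *
 *      (1514 * (65535 / 2)) >> 16  ~=  757 bytes
 *
 *  per rotation (plus a little dithering), i.e. the host's aggregate share
 *  stays near one quantum no matter how many flows it opens.)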
2213 | */ 2214 | flow->set = CAKE_SET_SPARSE_WAIT; 2215 | } 2216 | } 2217 | 2218 | if (cake_dsrc(q->flow_mode)) 2219 | host_load = max(host_load, srchost->srchost_bulk_flow_count); 2220 | 2221 | if (cake_ddst(q->flow_mode)) 2222 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2223 | 2224 | WARN_ON(host_load > CAKE_QUEUES); 2225 | 2226 | /* The shifted prandom_u32() is a way to apply dithering to 2227 | * avoid accumulating roundoff errors 2228 | */ 2229 | flow->deficit += (b->flow_quantum * quantum_div[host_load] + 2230 | (prandom_u32() >> 16)) >> 16; 2231 | list_move_tail(&flow->flowchain, &b->old_flows); 2232 | 2233 | goto retry; 2234 | } 2235 | 2236 | /* Retrieve a packet via the AQM */ 2237 | while (1) { 2238 | skb = cake_dequeue_one(sch); 2239 | if (!skb) { 2240 | /* this queue was actually empty */ 2241 | if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2242 | b->unresponsive_flow_count--; 2243 | 2244 | if (flow->cvars.p_drop || flow->cvars.count || 2245 | ktime_before(now, flow->cvars.drop_next)) { 2246 | /* keep in the flowchain until the state has 2247 | * decayed to rest 2248 | */ 2249 | list_move_tail(&flow->flowchain, 2250 | &b->decaying_flows); 2251 | if (flow->set == CAKE_SET_BULK) { 2252 | b->bulk_flow_count--; 2253 | 2254 | if (cake_dsrc(q->flow_mode)) 2255 | srchost->srchost_bulk_flow_count--; 2256 | 2257 | if (cake_ddst(q->flow_mode)) 2258 | dsthost->dsthost_bulk_flow_count--; 2259 | 2260 | b->decaying_flow_count++; 2261 | } else if (flow->set == CAKE_SET_SPARSE || 2262 | flow->set == CAKE_SET_SPARSE_WAIT) { 2263 | b->sparse_flow_count--; 2264 | b->decaying_flow_count++; 2265 | } 2266 | flow->set = CAKE_SET_DECAYING; 2267 | } else { 2268 | /* remove empty queue from the flowchain */ 2269 | list_del_init(&flow->flowchain); 2270 | if (flow->set == CAKE_SET_SPARSE || 2271 | flow->set == CAKE_SET_SPARSE_WAIT) 2272 | b->sparse_flow_count--; 2273 | else if (flow->set == CAKE_SET_BULK) { 2274 | b->bulk_flow_count--; 2275 | 2276 | if (cake_dsrc(q->flow_mode)) 2277 | srchost->srchost_bulk_flow_count--; 2278 | 2279 | if (cake_ddst(q->flow_mode)) 2280 | dsthost->dsthost_bulk_flow_count--; 2281 | 2282 | } else 2283 | b->decaying_flow_count--; 2284 | 2285 | flow->set = CAKE_SET_NONE; 2286 | } 2287 | goto begin; 2288 | } 2289 | 2290 | /* Last packet in queue may be marked, shouldn't be dropped */ 2291 | if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2292 | (b->bulk_flow_count * 2293 | !!(q->rate_flags & 2294 | CAKE_FLAG_INGRESS))) || 2295 | !flow->head) 2296 | break; 2297 | 2298 | /* drop this packet, get another one */ 2299 | if (q->rate_flags & CAKE_FLAG_INGRESS) { 2300 | len = cake_advance_shaper(q, b, skb, 2301 | now, true); 2302 | flow->deficit -= len; 2303 | b->tin_deficit -= len; 2304 | } 2305 | flow->dropped++; 2306 | b->tin_dropped++; 2307 | qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2308 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 2309 | qdisc_drop(skb, sch); 2310 | #else 2311 | qdisc_qstats_drop(sch); 2312 | kfree_skb(skb); 2313 | #endif 2314 | if (q->rate_flags & CAKE_FLAG_INGRESS) 2315 | goto retry; 2316 | } 2317 | 2318 | b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2319 | qdisc_bstats_update(sch, skb); 2320 | 2321 | /* collect delay stats */ 2322 | delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2323 | b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2324 | b->peak_delay = cake_ewma(b->peak_delay, delay, 2325 | delay > b->peak_delay ? 
2 : 8); 2326 | b->base_delay = cake_ewma(b->base_delay, delay, 2327 | delay < b->base_delay ? 2 : 8); 2328 | 2329 | len = cake_advance_shaper(q, b, skb, now, false); 2330 | flow->deficit -= len; 2331 | b->tin_deficit -= len; 2332 | 2333 | if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2334 | u64 next = min(ktime_to_ns(q->time_next_packet), 2335 | ktime_to_ns(q->failsafe_next_packet)); 2336 | 2337 | qdisc_watchdog_schedule_ns(&q->watchdog, next); 2338 | } else if (!sch->q.qlen) { 2339 | int i; 2340 | 2341 | for (i = 0; i < q->tin_cnt; i++) { 2342 | if (q->tins[i].decaying_flow_count) { 2343 | ktime_t next = \ 2344 | ktime_add_ns(now, 2345 | q->tins[i].cparams.target); 2346 | 2347 | qdisc_watchdog_schedule_ns(&q->watchdog, 2348 | ktime_to_ns(next)); 2349 | break; 2350 | } 2351 | } 2352 | } 2353 | 2354 | if (q->overflow_timeout) 2355 | q->overflow_timeout--; 2356 | 2357 | return skb; 2358 | } 2359 | 2360 | static void cake_reset(struct Qdisc *sch) 2361 | { 2362 | u32 c; 2363 | 2364 | for (c = 0; c < CAKE_MAX_TINS; c++) 2365 | cake_clear_tin(sch, c); 2366 | } 2367 | 2368 | static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2369 | [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2370 | [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2371 | [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2372 | [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2373 | [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2374 | [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2375 | [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2376 | [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2377 | [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2378 | [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2379 | [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2380 | [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2381 | [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2382 | [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2383 | [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2384 | [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2385 | [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2386 | }; 2387 | 2388 | static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2389 | u64 target_ns, u64 rtt_est_ns) 2390 | { 2391 | /* convert byte-rate into time-per-byte 2392 | * so it will always unwedge in reasonable time. 2393 | */ 2394 | static const u64 MIN_RATE = 64; 2395 | u32 byte_target = mtu; 2396 | u64 byte_target_ns; 2397 | u8 rate_shft = 0; 2398 | u64 rate_ns = 0; 2399 | 2400 | b->flow_quantum = 1514; 2401 | if (rate) { 2402 | b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2403 | rate_shft = 34; 2404 | rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2405 | rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2406 | while (!!(rate_ns >> 34)) { 2407 | rate_ns >>= 1; 2408 | rate_shft--; 2409 | } 2410 | } /* else unlimited, ie. 
zero delay */ 2411 | 2412 | b->tin_rate_bps = rate; 2413 | b->tin_rate_ns = rate_ns; 2414 | b->tin_rate_shft = rate_shft; 2415 | 2416 | byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2417 | 2418 | b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2419 | b->cparams.interval = max(rtt_est_ns + 2420 | b->cparams.target - target_ns, 2421 | b->cparams.target * 2); 2422 | b->cparams.mtu_time = byte_target_ns; 2423 | b->cparams.p_inc = 1 << 24; /* 1/256 */ 2424 | b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2425 | } 2426 | 2427 | static int cake_config_besteffort(struct Qdisc *sch) 2428 | { 2429 | struct cake_sched_data *q = qdisc_priv(sch); 2430 | struct cake_tin_data *b = &q->tins[0]; 2431 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2432 | u64 rate = q->rate_bps; 2433 | 2434 | q->tin_cnt = 1; 2435 | 2436 | q->tin_index = besteffort; 2437 | q->tin_order = normal_order; 2438 | 2439 | cake_set_rate(b, rate, mtu, 2440 | us_to_ns(q->target), us_to_ns(q->interval)); 2441 | b->tin_quantum = 65535; 2442 | 2443 | return 0; 2444 | } 2445 | 2446 | static int cake_config_precedence(struct Qdisc *sch) 2447 | { 2448 | /* convert high-level (user visible) parameters into internal format */ 2449 | struct cake_sched_data *q = qdisc_priv(sch); 2450 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2451 | u64 rate = q->rate_bps; 2452 | u32 quantum = 256; 2453 | u32 i; 2454 | 2455 | q->tin_cnt = 8; 2456 | q->tin_index = precedence; 2457 | q->tin_order = normal_order; 2458 | 2459 | for (i = 0; i < q->tin_cnt; i++) { 2460 | struct cake_tin_data *b = &q->tins[i]; 2461 | 2462 | cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2463 | us_to_ns(q->interval)); 2464 | 2465 | b->tin_quantum = max_t(u16, 1U, quantum); 2466 | 2467 | /* calculate next class's parameters */ 2468 | rate *= 7; 2469 | rate >>= 3; 2470 | 2471 | quantum *= 7; 2472 | quantum >>= 3; 2473 | } 2474 | 2475 | return 0; 2476 | } 2477 | 2478 | /* List of known Diffserv codepoints: 2479 | * 2480 | * Least Effort (CS1) 2481 | * Best Effort (CS0) 2482 | * Max Reliability & LLT "Lo" (TOS1) 2483 | * Max Throughput (TOS2) 2484 | * Min Delay (TOS4) 2485 | * LLT "La" (TOS5) 2486 | * Assured Forwarding 1 (AF1x) - x3 2487 | * Assured Forwarding 2 (AF2x) - x3 2488 | * Assured Forwarding 3 (AF3x) - x3 2489 | * Assured Forwarding 4 (AF4x) - x3 2490 | * Precedence Class 2 (CS2) 2491 | * Precedence Class 3 (CS3) 2492 | * Precedence Class 4 (CS4) 2493 | * Precedence Class 5 (CS5) 2494 | * Precedence Class 6 (CS6) 2495 | * Precedence Class 7 (CS7) 2496 | * Voice Admit (VA) 2497 | * Expedited Forwarding (EF) 2498 | 2499 | * Total 25 codepoints. 2500 | */ 2501 | 2502 | /* List of traffic classes in RFC 4594: 2503 | * (roughly descending order of contended priority) 2504 | * (roughly ascending order of uncontended throughput) 2505 | * 2506 | * Network Control (CS6,CS7) - routing traffic 2507 | * Telephony (EF,VA) - aka. VoIP streams 2508 | * Signalling (CS5) - VoIP setup 2509 | * Multimedia Conferencing (AF4x) - aka. video calls 2510 | * Realtime Interactive (CS4) - eg. games 2511 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2512 | * Broadcast Video (CS3) 2513 | * Low Latency Data (AF2x,TOS4) - eg. database 2514 | * Ops, Admin, Management (CS2,TOS1) - eg. ssh 2515 | * Standard Service (CS0 & unrecognised codepoints) 2516 | * High Throughput Data (AF1x,TOS2) - eg. web traffic 2517 | * Low Priority Data (CS1) - eg. BitTorrent 2518 | 2519 | * Total 12 traffic classes. 
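 *
 * (Encoding note: the DSCP occupies the top six bits of the IPv4 TOS /
 *  IPv6 Traffic Class octet, and cake_handle_diffserv() above recovers it
 *  with a ">> 2".  For example EF is codepoint 46, carried on the wire as
 *  0xB8, and CS1 is codepoint 8, carried as 0x20; the CS7 value 0x38 is
 *  what the ARP special case returns directly.)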
2520 | */ 2521 | 2522 | static int cake_config_diffserv8(struct Qdisc *sch) 2523 | { 2524 | /* Pruned list of traffic classes for typical applications: 2525 | * 2526 | * Network Control (CS6, CS7) 2527 | * Minimum Latency (EF, VA, CS5, CS4) 2528 | * Interactive Shell (CS2, TOS1) 2529 | * Low Latency Transactions (AF2x, TOS4) 2530 | * Video Streaming (AF4x, AF3x, CS3) 2531 | * Bog Standard (CS0 etc.) 2532 | * High Throughput (AF1x, TOS2) 2533 | * Background Traffic (CS1) 2534 | * 2535 | * Total 8 traffic classes. 2536 | */ 2537 | 2538 | struct cake_sched_data *q = qdisc_priv(sch); 2539 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2540 | u64 rate = q->rate_bps; 2541 | u32 quantum = 256; 2542 | u32 i; 2543 | 2544 | q->tin_cnt = 8; 2545 | 2546 | /* codepoint to class mapping */ 2547 | q->tin_index = diffserv8; 2548 | q->tin_order = normal_order; 2549 | 2550 | /* class characteristics */ 2551 | for (i = 0; i < q->tin_cnt; i++) { 2552 | struct cake_tin_data *b = &q->tins[i]; 2553 | 2554 | cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2555 | us_to_ns(q->interval)); 2556 | 2557 | b->tin_quantum = max_t(u16, 1U, quantum); 2558 | 2559 | /* calculate next class's parameters */ 2560 | rate *= 7; 2561 | rate >>= 3; 2562 | 2563 | quantum *= 7; 2564 | quantum >>= 3; 2565 | } 2566 | 2567 | return 0; 2568 | } 2569 | 2570 | static int cake_config_diffserv4(struct Qdisc *sch) 2571 | { 2572 | /* Further pruned list of traffic classes for four-class system: 2573 | * 2574 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2575 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 2576 | * Best Effort (CS0, AF1x, TOS2, and those not specified) 2577 | * Background Traffic (CS1) 2578 | * 2579 | * Total 4 traffic classes. 2580 | */ 2581 | 2582 | struct cake_sched_data *q = qdisc_priv(sch); 2583 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2584 | u64 rate = q->rate_bps; 2585 | u32 quantum = 1024; 2586 | 2587 | q->tin_cnt = 4; 2588 | 2589 | /* codepoint to class mapping */ 2590 | q->tin_index = diffserv4; 2591 | q->tin_order = bulk_order; 2592 | 2593 | /* class characteristics */ 2594 | cake_set_rate(&q->tins[0], rate, mtu, 2595 | us_to_ns(q->target), us_to_ns(q->interval)); 2596 | cake_set_rate(&q->tins[1], rate >> 4, mtu, 2597 | us_to_ns(q->target), us_to_ns(q->interval)); 2598 | cake_set_rate(&q->tins[2], rate >> 1, mtu, 2599 | us_to_ns(q->target), us_to_ns(q->interval)); 2600 | cake_set_rate(&q->tins[3], rate >> 2, mtu, 2601 | us_to_ns(q->target), us_to_ns(q->interval)); 2602 | 2603 | /* bandwidth-sharing weights */ 2604 | q->tins[0].tin_quantum = quantum; 2605 | q->tins[1].tin_quantum = quantum >> 4; 2606 | q->tins[2].tin_quantum = quantum >> 1; 2607 | q->tins[3].tin_quantum = quantum >> 2; 2608 | 2609 | return 0; 2610 | } 2611 | 2612 | static int cake_config_diffserv3(struct Qdisc *sch) 2613 | { 2614 | /* Simplified Diffserv structure with 3 tins. 
2615 | * Low Priority (CS1) 2616 | * Best Effort 2617 | * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 2618 | */ 2619 | struct cake_sched_data *q = qdisc_priv(sch); 2620 | u32 mtu = psched_mtu(qdisc_dev(sch)); 2621 | u64 rate = q->rate_bps; 2622 | u32 quantum = 1024; 2623 | 2624 | q->tin_cnt = 3; 2625 | 2626 | /* codepoint to class mapping */ 2627 | q->tin_index = diffserv3; 2628 | q->tin_order = bulk_order; 2629 | 2630 | /* class characteristics */ 2631 | cake_set_rate(&q->tins[0], rate, mtu, 2632 | us_to_ns(q->target), us_to_ns(q->interval)); 2633 | cake_set_rate(&q->tins[1], rate >> 4, mtu, 2634 | us_to_ns(q->target), us_to_ns(q->interval)); 2635 | cake_set_rate(&q->tins[2], rate >> 2, mtu, 2636 | us_to_ns(q->target), us_to_ns(q->interval)); 2637 | 2638 | /* bandwidth-sharing weights */ 2639 | q->tins[0].tin_quantum = quantum; 2640 | q->tins[1].tin_quantum = quantum >> 4; 2641 | q->tins[2].tin_quantum = quantum >> 2; 2642 | 2643 | return 0; 2644 | } 2645 | 2646 | static void cake_reconfigure(struct Qdisc *sch) 2647 | { 2648 | struct cake_sched_data *q = qdisc_priv(sch); 2649 | int c, ft; 2650 | 2651 | switch (q->tin_mode) { 2652 | case CAKE_DIFFSERV_BESTEFFORT: 2653 | ft = cake_config_besteffort(sch); 2654 | break; 2655 | 2656 | case CAKE_DIFFSERV_PRECEDENCE: 2657 | ft = cake_config_precedence(sch); 2658 | break; 2659 | 2660 | case CAKE_DIFFSERV_DIFFSERV8: 2661 | ft = cake_config_diffserv8(sch); 2662 | break; 2663 | 2664 | case CAKE_DIFFSERV_DIFFSERV4: 2665 | ft = cake_config_diffserv4(sch); 2666 | break; 2667 | 2668 | case CAKE_DIFFSERV_DIFFSERV3: 2669 | default: 2670 | ft = cake_config_diffserv3(sch); 2671 | break; 2672 | } 2673 | 2674 | for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2675 | cake_clear_tin(sch, c); 2676 | q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2677 | } 2678 | 2679 | q->rate_ns = q->tins[ft].tin_rate_ns; 2680 | q->rate_shft = q->tins[ft].tin_rate_shft; 2681 | 2682 | if (q->buffer_config_limit) { 2683 | q->buffer_limit = q->buffer_config_limit; 2684 | } else if (q->rate_bps) { 2685 | u64 t = q->rate_bps * q->interval; 2686 | 2687 | do_div(t, USEC_PER_SEC / 4); 2688 | q->buffer_limit = max_t(u32, t, 4U << 20); 2689 | } else { 2690 | q->buffer_limit = ~0; 2691 | } 2692 | 2693 | sch->flags &= ~TCQ_F_CAN_BYPASS; 2694 | 2695 | q->buffer_limit = min(q->buffer_limit, 2696 | max(sch->limit * psched_mtu(qdisc_dev(sch)), 2697 | q->buffer_config_limit)); 2698 | } 2699 | 2700 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2701 | static int cake_change(struct Qdisc *sch, struct nlattr *opt) 2702 | #else 2703 | static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2704 | struct netlink_ext_ack *extack) 2705 | #endif 2706 | { 2707 | struct cake_sched_data *q = qdisc_priv(sch); 2708 | struct nlattr *tb[TCA_CAKE_MAX + 1]; 2709 | int err; 2710 | 2711 | if (!opt) 2712 | return -EINVAL; 2713 | 2714 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) 2715 | err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy); 2716 | #elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2717 | err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, NULL); 2718 | #else 2719 | err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); 2720 | #endif 2721 | if (err < 0) 2722 | return err; 2723 | 2724 | if (tb[TCA_CAKE_NAT]) { 2725 | #if IS_REACHABLE(CONFIG_NF_CONNTRACK) 2726 | q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2727 | q->flow_mode |= CAKE_FLOW_NAT_FLAG * 2728 | !!nla_get_u32(tb[TCA_CAKE_NAT]); 2729 | #else 2730 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) 
2731 | NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2732 | "No conntrack support in kernel"); 2733 | #endif 2734 | return -EOPNOTSUPP; 2735 | #endif 2736 | } 2737 | 2738 | if (tb[TCA_CAKE_BASE_RATE64]) 2739 | q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); 2740 | 2741 | if (tb[TCA_CAKE_DIFFSERV_MODE]) 2742 | q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); 2743 | 2744 | if (tb[TCA_CAKE_WASH]) { 2745 | if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2746 | q->rate_flags |= CAKE_FLAG_WASH; 2747 | else 2748 | q->rate_flags &= ~CAKE_FLAG_WASH; 2749 | } 2750 | 2751 | if (tb[TCA_CAKE_FLOW_MODE]) 2752 | q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | 2753 | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2754 | CAKE_FLOW_MASK)); 2755 | 2756 | if (tb[TCA_CAKE_ATM]) 2757 | q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); 2758 | 2759 | if (tb[TCA_CAKE_OVERHEAD]) { 2760 | q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); 2761 | q->rate_flags |= CAKE_FLAG_OVERHEAD; 2762 | 2763 | q->max_netlen = 0; 2764 | q->max_adjlen = 0; 2765 | q->min_netlen = ~0; 2766 | q->min_adjlen = ~0; 2767 | } 2768 | 2769 | if (tb[TCA_CAKE_RAW]) { 2770 | q->rate_flags &= ~CAKE_FLAG_OVERHEAD; 2771 | 2772 | q->max_netlen = 0; 2773 | q->max_adjlen = 0; 2774 | q->min_netlen = ~0; 2775 | q->min_adjlen = ~0; 2776 | } 2777 | 2778 | if (tb[TCA_CAKE_MPU]) 2779 | q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); 2780 | 2781 | if (tb[TCA_CAKE_RTT]) { 2782 | q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2783 | 2784 | if (!q->interval) 2785 | q->interval = 1; 2786 | } 2787 | 2788 | if (tb[TCA_CAKE_TARGET]) { 2789 | q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2790 | 2791 | if (!q->target) 2792 | q->target = 1; 2793 | } 2794 | 2795 | if (tb[TCA_CAKE_AUTORATE]) { 2796 | if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2797 | q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2798 | else 2799 | q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2800 | } 2801 | 2802 | if (tb[TCA_CAKE_INGRESS]) { 2803 | if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2804 | q->rate_flags |= CAKE_FLAG_INGRESS; 2805 | else 2806 | q->rate_flags &= ~CAKE_FLAG_INGRESS; 2807 | } 2808 | 2809 | if (tb[TCA_CAKE_ACK_FILTER]) 2810 | q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); 2811 | 2812 | if (tb[TCA_CAKE_MEMORY]) 2813 | q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); 2814 | 2815 | if (tb[TCA_CAKE_SPLIT_GSO]) { 2816 | if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2817 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2818 | else 2819 | q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2820 | } 2821 | 2822 | if (tb[TCA_CAKE_FWMARK]) { 2823 | q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); 2824 | q->fwmark_shft = q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0; 2825 | } 2826 | 2827 | if (q->tins) { 2828 | sch_tree_lock(sch); 2829 | cake_reconfigure(sch); 2830 | sch_tree_unlock(sch); 2831 | } 2832 | 2833 | return 0; 2834 | } 2835 | 2836 | static void cake_destroy(struct Qdisc *sch) 2837 | { 2838 | struct cake_sched_data *q = qdisc_priv(sch); 2839 | 2840 | qdisc_watchdog_cancel(&q->watchdog); 2841 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 2842 | tcf_destroy_chain(&q->filter_list); 2843 | #else 2844 | tcf_block_put(q->block); 2845 | #endif 2846 | kvfree(q->tins); 2847 | } 2848 | 2849 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2850 | static int cake_init(struct Qdisc *sch, struct nlattr *opt) 2851 | #else 2852 | static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2853 | struct netlink_ext_ack *extack) 2854 | #endif 2855 | { 2856 | struct cake_sched_data *q = qdisc_priv(sch); 2857 | int i, j, err; 2858 | 2859 | sch->limit = 10240; 2860 | q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2861 | q->flow_mode = CAKE_FLOW_TRIPLE; 2862 | 2863 | q->rate_bps = 0; /* unlimited by default */ 2864 | 2865 | q->interval = 100000; /* 100ms default */ 2866 | q->target = 5000; /* 5ms: codel RFC argues 2867 | * for 5 to 10% of interval 2868 | */ 2869 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2870 | q->cur_tin = 0; 2871 | q->cur_flow = 0; 2872 | 2873 | qdisc_watchdog_init(&q->watchdog, sch); 2874 | 2875 | if (opt) { 2876 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2877 | err = cake_change(sch, opt); 2878 | #else 2879 | err = cake_change(sch, opt, extack); 2880 | #endif 2881 | 2882 | if (err) 2883 | return err; 2884 | } 2885 | 2886 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) 2887 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) 2888 | err = tcf_block_get(&q->block, &q->filter_list); 2889 | #elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 2890 | err = tcf_block_get(&q->block, &q->filter_list, sch); 2891 | #else 2892 | err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2893 | #endif 2894 | if (err) 2895 | return err; 2896 | #endif 2897 | 2898 | quantum_div[0] = ~0; 2899 | for (i = 1; i <= CAKE_QUEUES; i++) 2900 | quantum_div[i] = 65535 / i; 2901 | 2902 | q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), 2903 | GFP_KERNEL); 2904 | if (!q->tins) 2905 | goto nomem; 2906 | 2907 | for (i = 0; i < CAKE_MAX_TINS; i++) { 2908 | struct cake_tin_data *b = q->tins + i; 2909 | 2910 | b->perturb = prandom_u32(); 2911 | INIT_LIST_HEAD(&b->new_flows); 2912 | INIT_LIST_HEAD(&b->old_flows); 2913 | INIT_LIST_HEAD(&b->decaying_flows); 2914 | b->sparse_flow_count = 0; 2915 | b->bulk_flow_count = 0; 2916 | b->decaying_flow_count = 0; 2917 | 2918 | for (j = 0; j < CAKE_QUEUES; j++) { 2919 | struct cake_flow *flow = b->flows + j; 2920 | u32 k = j * CAKE_MAX_TINS + i; 2921 | 2922 | INIT_LIST_HEAD(&flow->flowchain); 2923 | cobalt_vars_init(&flow->cvars); 2924 | 2925 | q->overflow_heap[k].t = i; 2926 | q->overflow_heap[k].b = j; 2927 | b->overflow_idx[j] = k; 2928 | } 2929 | } 2930 | 2931 | cake_reconfigure(sch); 2932 | q->avg_peak_bandwidth = q->rate_bps; 2933 | q->min_netlen = ~0; 2934 | q->min_adjlen = ~0; 2935 | return 0; 2936 | 2937 | nomem: 2938 | cake_destroy(sch); 2939 | return -ENOMEM; 2940 | } 2941 | 2942 | static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2943 | { 2944 | struct cake_sched_data *q = qdisc_priv(sch); 2945 | struct nlattr *opts; 2946 | 2947 | opts = nla_nest_start(skb, TCA_OPTIONS); 2948 | if (!opts) 2949 | goto nla_put_failure; 2950 | 2951 | if (nla_put_u64_64bit(skb, 
TCA_CAKE_BASE_RATE64, q->rate_bps, 2952 | TCA_CAKE_PAD)) 2953 | goto nla_put_failure; 2954 | 2955 | if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, 2956 | q->flow_mode & CAKE_FLOW_MASK)) 2957 | goto nla_put_failure; 2958 | 2959 | if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) 2960 | goto nla_put_failure; 2961 | 2962 | if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) 2963 | goto nla_put_failure; 2964 | 2965 | if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) 2966 | goto nla_put_failure; 2967 | 2968 | if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2969 | !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2970 | goto nla_put_failure; 2971 | 2972 | if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2973 | !!(q->rate_flags & CAKE_FLAG_INGRESS))) 2974 | goto nla_put_failure; 2975 | 2976 | if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) 2977 | goto nla_put_failure; 2978 | 2979 | if (nla_put_u32(skb, TCA_CAKE_NAT, 2980 | !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) 2981 | goto nla_put_failure; 2982 | 2983 | if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) 2984 | goto nla_put_failure; 2985 | 2986 | if (nla_put_u32(skb, TCA_CAKE_WASH, 2987 | !!(q->rate_flags & CAKE_FLAG_WASH))) 2988 | goto nla_put_failure; 2989 | 2990 | if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) 2991 | goto nla_put_failure; 2992 | 2993 | if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) 2994 | if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2995 | goto nla_put_failure; 2996 | 2997 | if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) 2998 | goto nla_put_failure; 2999 | 3000 | if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) 3001 | goto nla_put_failure; 3002 | 3003 | if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 3004 | !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) 3005 | goto nla_put_failure; 3006 | 3007 | if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) 3008 | goto nla_put_failure; 3009 | 3010 | return nla_nest_end(skb, opts); 3011 | 3012 | nla_put_failure: 3013 | return -1; 3014 | } 3015 | 3016 | static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 3017 | { 3018 | struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); 3019 | struct cake_sched_data *q = qdisc_priv(sch); 3020 | struct nlattr *tstats, *ts; 3021 | int i; 3022 | 3023 | if (!stats) 3024 | return -1; 3025 | 3026 | #define PUT_STAT_U32(attr, data) do { \ 3027 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3028 | goto nla_put_failure; \ 3029 | } while (0) 3030 | #define PUT_STAT_U64(attr, data) do { \ 3031 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 3032 | data, TCA_CAKE_STATS_PAD)) \ 3033 | goto nla_put_failure; \ 3034 | } while (0) 3035 | 3036 | PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 3037 | PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 3038 | PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 3039 | PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 3040 | PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 3041 | PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 3042 | PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 3043 | PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 3044 | 3045 | #undef PUT_STAT_U32 3046 | #undef PUT_STAT_U64 3047 | 3048 | tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); 3049 | if (!tstats) 3050 | goto nla_put_failure; 3051 | 3052 | #define PUT_TSTAT_U32(attr, data) do { \ 3053 | if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 3054 | goto nla_put_failure; \ 3055 | } while (0) 3056 | #define PUT_TSTAT_U64(attr, data) do { \ 3057 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 
3058 | data, TCA_CAKE_TIN_STATS_PAD)) \ 3059 | goto nla_put_failure; \ 3060 | } while (0) 3061 | 3062 | for (i = 0; i < q->tin_cnt; i++) { 3063 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3064 | 3065 | ts = nla_nest_start(d->skb, i + 1); 3066 | if (!ts) 3067 | goto nla_put_failure; 3068 | 3069 | PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 3070 | PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 3071 | PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 3072 | 3073 | PUT_TSTAT_U32(TARGET_US, 3074 | ktime_to_us(ns_to_ktime(b->cparams.target))); 3075 | PUT_TSTAT_U32(INTERVAL_US, 3076 | ktime_to_us(ns_to_ktime(b->cparams.interval))); 3077 | 3078 | PUT_TSTAT_U32(SENT_PACKETS, b->packets); 3079 | PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 3080 | PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 3081 | PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 3082 | 3083 | PUT_TSTAT_U32(PEAK_DELAY_US, 3084 | ktime_to_us(ns_to_ktime(b->peak_delay))); 3085 | PUT_TSTAT_U32(AVG_DELAY_US, 3086 | ktime_to_us(ns_to_ktime(b->avge_delay))); 3087 | PUT_TSTAT_U32(BASE_DELAY_US, 3088 | ktime_to_us(ns_to_ktime(b->base_delay))); 3089 | 3090 | PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 3091 | PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 3092 | PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 3093 | 3094 | PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 3095 | b->decaying_flow_count); 3096 | PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 3097 | PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 3098 | PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 3099 | 3100 | PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 3101 | nla_nest_end(d->skb, ts); 3102 | } 3103 | 3104 | #undef PUT_TSTAT_U32 3105 | #undef PUT_TSTAT_U64 3106 | 3107 | nla_nest_end(d->skb, tstats); 3108 | return nla_nest_end(d->skb, stats); 3109 | 3110 | nla_put_failure: 3111 | nla_nest_cancel(d->skb, stats); 3112 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 3113 | sch_tree_unlock(sch); 3114 | #endif 3115 | return -1; 3116 | } 3117 | 3118 | static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 3119 | { 3120 | return NULL; 3121 | } 3122 | 3123 | static unsigned long cake_find(struct Qdisc *sch, u32 classid) 3124 | { 3125 | return 0; 3126 | } 3127 | 3128 | static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 3129 | u32 classid) 3130 | { 3131 | return 0; 3132 | } 3133 | 3134 | static void cake_unbind(struct Qdisc *q, unsigned long cl) 3135 | { 3136 | } 3137 | 3138 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 3139 | static struct tcf_proto __rcu **cake_find_tcf(struct Qdisc *sch, unsigned long cl) 3140 | #else 3141 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) 3142 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl) 3143 | #else 3144 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 3145 | struct netlink_ext_ack *extack) 3146 | #endif 3147 | #endif 3148 | { 3149 | struct cake_sched_data *q = qdisc_priv(sch); 3150 | 3151 | if (cl) 3152 | return NULL; 3153 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 3154 | return &q->filter_list; 3155 | #else 3156 | return q->block; 3157 | #endif 3158 | } 3159 | 3160 | static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 3161 | struct sk_buff *skb, struct tcmsg *tcm) 3162 | { 3163 | tcm->tcm_handle |= TC_H_MIN(cl); 3164 | return 0; 3165 | } 3166 | 3167 | static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 3168 | struct gnet_dump *d) 3169 | { 3170 | struct cake_sched_data *q = 
qdisc_priv(sch); 3171 | const struct cake_flow *flow = NULL; 3172 | struct gnet_stats_queue qs = { 0 }; 3173 | struct nlattr *stats; 3174 | u32 idx = cl - 1; 3175 | 3176 | if (idx < CAKE_QUEUES * q->tin_cnt) { 3177 | const struct cake_tin_data *b = \ 3178 | &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3179 | const struct sk_buff *skb; 3180 | 3181 | flow = &b->flows[idx % CAKE_QUEUES]; 3182 | 3183 | if (flow->head) { 3184 | cake_maybe_lock(sch); 3185 | skb = flow->head; 3186 | while (skb) { 3187 | qs.qlen++; 3188 | skb = skb->next; 3189 | } 3190 | cake_maybe_unlock(sch); 3191 | } 3192 | qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3193 | qs.drops = flow->dropped; 3194 | } 3195 | if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3196 | return -1; 3197 | if (flow) { 3198 | ktime_t now = ktime_get(); 3199 | 3200 | stats = nla_nest_start(d->skb, TCA_STATS_APP); 3201 | if (!stats) 3202 | return -1; 3203 | 3204 | #define PUT_STAT_U32(attr, data) do { \ 3205 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3206 | goto nla_put_failure; \ 3207 | } while (0) 3208 | #define PUT_STAT_S32(attr, data) do { \ 3209 | if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3210 | goto nla_put_failure; \ 3211 | } while (0) 3212 | 3213 | PUT_STAT_S32(DEFICIT, flow->deficit); 3214 | PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3215 | PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3216 | PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3217 | if (flow->cvars.p_drop) { 3218 | PUT_STAT_S32(BLUE_TIMER_US, 3219 | ktime_to_us( 3220 | ktime_sub(now, 3221 | flow->cvars.blue_timer))); 3222 | } 3223 | if (flow->cvars.dropping) { 3224 | PUT_STAT_S32(DROP_NEXT_US, 3225 | ktime_to_us( 3226 | ktime_sub(now, 3227 | flow->cvars.drop_next))); 3228 | } 3229 | 3230 | if (nla_nest_end(d->skb, stats) < 0) 3231 | return -1; 3232 | } 3233 | 3234 | return 0; 3235 | 3236 | nla_put_failure: 3237 | nla_nest_cancel(d->skb, stats); 3238 | return -1; 3239 | } 3240 | 3241 | static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3242 | { 3243 | struct cake_sched_data *q = qdisc_priv(sch); 3244 | unsigned int i, j; 3245 | 3246 | if (arg->stop) 3247 | return; 3248 | 3249 | for (i = 0; i < q->tin_cnt; i++) { 3250 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3251 | 3252 | for (j = 0; j < CAKE_QUEUES; j++) { 3253 | if (list_empty(&b->flows[j].flowchain) || 3254 | arg->count < arg->skip) { 3255 | arg->count++; 3256 | continue; 3257 | } 3258 | if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { 3259 | arg->stop = 1; 3260 | break; 3261 | } 3262 | arg->count++; 3263 | } 3264 | } 3265 | } 3266 | 3267 | static const struct Qdisc_class_ops cake_class_ops = { 3268 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 3269 | .tcf_chain = cake_find_tcf, 3270 | #else 3271 | .tcf_block = cake_tcf_block, 3272 | #endif 3273 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) 3274 | .get = cake_find, 3275 | .put = cake_unbind, 3276 | #else 3277 | .find = cake_find, 3278 | #endif 3279 | .unbind_tcf = cake_unbind, 3280 | .bind_tcf = cake_bind, 3281 | .leaf = cake_leaf, 3282 | .dump = cake_dump_class, 3283 | .dump_stats = cake_dump_class_stats, 3284 | .walk = cake_walk, 3285 | }; 3286 | 3287 | static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3288 | .cl_ops = &cake_class_ops, 3289 | .id = "cake", 3290 | .priv_size = sizeof(struct cake_sched_data), 3291 | .enqueue = cake_enqueue, 3292 | .dequeue = cake_dequeue, 3293 | .peek = qdisc_peek_dequeued, 3294 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 3295 | .drop = cake_drop, 
3296 | #endif 3297 | .init = cake_init, 3298 | .reset = cake_reset, 3299 | .destroy = cake_destroy, 3300 | .change = cake_change, 3301 | .dump = cake_dump, 3302 | .dump_stats = cake_dump_stats, 3303 | .owner = THIS_MODULE, 3304 | }; 3305 | 3306 | static int __init cake_module_init(void) 3307 | { 3308 | return register_qdisc(&cake_qdisc_ops); 3309 | } 3310 | 3311 | static void __exit cake_module_exit(void) 3312 | { 3313 | unregister_qdisc(&cake_qdisc_ops); 3314 | } 3315 | 3316 | module_init(cake_module_init) 3317 | module_exit(cake_module_exit) 3318 | MODULE_AUTHOR("Jonathan Morton"); 3319 | MODULE_LICENSE("Dual BSD/GPL"); 3320 | MODULE_DESCRIPTION("The CAKE shaper."); 3321 | --------------------------------------------------------------------------------
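A quick way to sanity-check the shaper arithmetic in the listing above is to run its two core calculations in userspace: the byte-rate to time-per-byte conversion from cake_set_rate() and the default buffer sizing from cake_reconfigure(). The sketch below is only an illustration, not part of the module; it assumes a 50 Mbit/s shaper rate expressed as 6,250,000 bytes/s (q->rate_bps is a byte rate, per the "convert byte-rate into time-per-byte" comment), a 1514-byte MTU, and the 100 ms default interval set in cake_init(), and it substitutes plain 64-bit division for the kernel's div64_u64()/do_div() helpers.

/*
 * Userspace sketch mirroring cake_set_rate() and the default memory
 * sizing in cake_reconfigure().  Variable names follow the kernel code;
 * the inputs (rate, mtu) are illustrative assumptions.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define USEC_PER_SEC 1000000ULL

int main(void)
{
        uint64_t rate = 6250000;        /* shaper rate in bytes/s (50 Mbit/s, assumed) */
        uint32_t interval = 100000;     /* q->interval in microseconds (100 ms default) */
        uint32_t mtu = 1514;            /* assumed psched_mtu() of an Ethernet device */
        uint64_t min_rate = 64;         /* MIN_RATE floor from cake_set_rate() */

        /* --- cake_set_rate(): ns-per-byte in fixed point --- */
        unsigned int rate_shft = 34;
        uint64_t rate_ns = NSEC_PER_SEC << rate_shft;

        rate_ns /= (rate > min_rate ? rate : min_rate);
        while (rate_ns >> 34) {         /* keep the mantissa below 2^34 */
                rate_ns >>= 1;
                rate_shft--;
        }

        /* serialisation time of one MTU at this rate (cparams.mtu_time) */
        uint64_t byte_target_ns = ((uint64_t)mtu * rate_ns) >> rate_shft;

        /* --- cake_reconfigure(): default buffer_limit when no memory override --- */
        uint64_t t = rate * interval;
        uint32_t buffer_limit;

        t /= USEC_PER_SEC / 4;          /* bytes sent in four intervals */
        buffer_limit = t > (4U << 20) ? (uint32_t)t : (4U << 20);

        printf("rate_ns=%llu rate_shft=%u mtu_time=%lluns buffer_limit=%u bytes\n",
               (unsigned long long)rate_ns, rate_shft,
               (unsigned long long)byte_target_ns, buffer_limit);
        return 0;
}

With these inputs the normalisation loop settles on rate_shft = 26, one MTU-sized packet costs roughly 242 us of serialisation time (which becomes cparams.mtu_time), and the four-interval buffer estimate of about 2.5 MB is raised to the 4 MiB floor before the sch->limit cap at the end of cake_reconfigure() is applied.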