├── .clang-format
├── .gitignore
├── .gitmodules
├── CITATION.cff
├── LICENSE
├── Makefile
├── README.md
├── bpf
├── .gitignore
├── Makefile
├── README.md
├── bpf_ca_helpers.h
├── powertcp.bpf.c
├── powertcp.cpp
├── powertcp_tcp-int.bpf.c
└── powertcp_tcp-int_head.bpf.c
├── dkms.conf
├── doc
├── code-structure.md
└── module.md
├── powertcp.c
├── powertcp_defs.h
├── powertcp_head.c
├── powertcp_int.c
├── powertcp_no-int.c
├── powertcp_no-int_head.c
├── powertcp_trace.h
├── tcp_powertcp.c
├── tcp_powertcp_trace.h
└── tools
├── README.md
├── bpf_tracer
├── gro_experiment
├── gro_plot
├── iperf_csv
├── iratio_experiment
├── iratio_plot
├── mtu_experiment
├── mtu_plot
├── plot
├── powertcp_experiment
├── powertcp_plot
├── reinsmod
├── screen
├── iperf-client.screen
└── iperf-servers.screen
├── send_something
├── setup-bpf
├── setup-module
├── tracing
└── to_csv
└── tune-eth
/.clang-format:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: GPL-2.0
2 | #
3 | # clang-format configuration file. Intended for clang-format >= 4.
4 | #
5 | # For more information, see:
6 | #
7 | # Documentation/process/clang-format.rst
8 | # https://clang.llvm.org/docs/ClangFormat.html
9 | # https://clang.llvm.org/docs/ClangFormatStyleOptions.html
10 | #
11 | ---
12 | AccessModifierOffset: -4
13 | AlignAfterOpenBracket: Align
14 | AlignConsecutiveAssignments: false
15 | AlignConsecutiveDeclarations: false
16 | #AlignEscapedNewlines: Left # Unknown to clang-format-4.0
17 | AlignOperands: true
18 | AlignTrailingComments: false
19 | AllowAllParametersOfDeclarationOnNextLine: false
20 | AllowShortBlocksOnASingleLine: false
21 | AllowShortCaseLabelsOnASingleLine: false
22 | AllowShortFunctionsOnASingleLine: None
23 | AllowShortIfStatementsOnASingleLine: false
24 | AllowShortLoopsOnASingleLine: false
25 | AlwaysBreakAfterDefinitionReturnType: None
26 | AlwaysBreakAfterReturnType: None
27 | AlwaysBreakBeforeMultilineStrings: false
28 | AlwaysBreakTemplateDeclarations: false
29 | BinPackArguments: true
30 | BinPackParameters: true
31 | BraceWrapping:
32 | AfterClass: false
33 | AfterControlStatement: false
34 | AfterEnum: false
35 | AfterFunction: true
36 | AfterNamespace: true
37 | AfterObjCDeclaration: false
38 | AfterStruct: false
39 | AfterUnion: false
40 | #AfterExternBlock: false # Unknown to clang-format-5.0
41 | BeforeCatch: false
42 | BeforeElse: false
43 | IndentBraces: false
44 | #SplitEmptyFunction: true # Unknown to clang-format-4.0
45 | #SplitEmptyRecord: true # Unknown to clang-format-4.0
46 | #SplitEmptyNamespace: true # Unknown to clang-format-4.0
47 | BreakBeforeBinaryOperators: None
48 | BreakBeforeBraces: Custom
49 | #BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
50 | BreakBeforeTernaryOperators: false
51 | BreakConstructorInitializersBeforeComma: false
52 | #BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
53 | BreakAfterJavaFieldAnnotations: false
54 | BreakStringLiterals: false
55 | ColumnLimit: 80
56 | CommentPragmas: '^ IWYU pragma:'
57 | #CompactNamespaces: false # Unknown to clang-format-4.0
58 | ConstructorInitializerAllOnOneLineOrOnePerLine: false
59 | ConstructorInitializerIndentWidth: 8
60 | ContinuationIndentWidth: 8
61 | Cpp11BracedListStyle: false
62 | DerivePointerAlignment: false
63 | DisableFormat: false
64 | ExperimentalAutoDetectBinPacking: false
65 | #FixNamespaceComments: false # Unknown to clang-format-4.0
66 |
67 | # Taken from:
68 | # git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
69 | # | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
70 | # | sort | uniq
71 | ForEachMacros:
72 | - 'apei_estatus_for_each_section'
73 | - 'ata_for_each_dev'
74 | - 'ata_for_each_link'
75 | - '__ata_qc_for_each'
76 | - 'ata_qc_for_each'
77 | - 'ata_qc_for_each_raw'
78 | - 'ata_qc_for_each_with_internal'
79 | - 'ax25_for_each'
80 | - 'ax25_uid_for_each'
81 | - '__bio_for_each_bvec'
82 | - 'bio_for_each_bvec'
83 | - 'bio_for_each_bvec_all'
84 | - 'bio_for_each_integrity_vec'
85 | - '__bio_for_each_segment'
86 | - 'bio_for_each_segment'
87 | - 'bio_for_each_segment_all'
88 | - 'bio_list_for_each'
89 | - 'bip_for_each_vec'
90 | - 'bitmap_for_each_clear_region'
91 | - 'bitmap_for_each_set_region'
92 | - 'blkg_for_each_descendant_post'
93 | - 'blkg_for_each_descendant_pre'
94 | - 'blk_queue_for_each_rl'
95 | - 'bond_for_each_slave'
96 | - 'bond_for_each_slave_rcu'
97 | - 'bpf_for_each_spilled_reg'
98 | - 'btree_for_each_safe128'
99 | - 'btree_for_each_safe32'
100 | - 'btree_for_each_safe64'
101 | - 'btree_for_each_safel'
102 | - 'card_for_each_dev'
103 | - 'cgroup_taskset_for_each'
104 | - 'cgroup_taskset_for_each_leader'
105 | - 'cpufreq_for_each_entry'
106 | - 'cpufreq_for_each_entry_idx'
107 | - 'cpufreq_for_each_valid_entry'
108 | - 'cpufreq_for_each_valid_entry_idx'
109 | - 'css_for_each_child'
110 | - 'css_for_each_descendant_post'
111 | - 'css_for_each_descendant_pre'
112 | - 'device_for_each_child_node'
113 | - 'displayid_iter_for_each'
114 | - 'dma_fence_chain_for_each'
115 | - 'do_for_each_ftrace_op'
116 | - 'drm_atomic_crtc_for_each_plane'
117 | - 'drm_atomic_crtc_state_for_each_plane'
118 | - 'drm_atomic_crtc_state_for_each_plane_state'
119 | - 'drm_atomic_for_each_plane_damage'
120 | - 'drm_client_for_each_connector_iter'
121 | - 'drm_client_for_each_modeset'
122 | - 'drm_connector_for_each_possible_encoder'
123 | - 'drm_for_each_bridge_in_chain'
124 | - 'drm_for_each_connector_iter'
125 | - 'drm_for_each_crtc'
126 | - 'drm_for_each_crtc_reverse'
127 | - 'drm_for_each_encoder'
128 | - 'drm_for_each_encoder_mask'
129 | - 'drm_for_each_fb'
130 | - 'drm_for_each_legacy_plane'
131 | - 'drm_for_each_plane'
132 | - 'drm_for_each_plane_mask'
133 | - 'drm_for_each_privobj'
134 | - 'drm_mm_for_each_hole'
135 | - 'drm_mm_for_each_node'
136 | - 'drm_mm_for_each_node_in_range'
137 | - 'drm_mm_for_each_node_safe'
138 | - 'flow_action_for_each'
139 | - 'for_each_acpi_dev_match'
140 | - 'for_each_active_dev_scope'
141 | - 'for_each_active_drhd_unit'
142 | - 'for_each_active_iommu'
143 | - 'for_each_aggr_pgid'
144 | - 'for_each_available_child_of_node'
145 | - 'for_each_bio'
146 | - 'for_each_board_func_rsrc'
147 | - 'for_each_bvec'
148 | - 'for_each_card_auxs'
149 | - 'for_each_card_auxs_safe'
150 | - 'for_each_card_components'
151 | - 'for_each_card_dapms'
152 | - 'for_each_card_pre_auxs'
153 | - 'for_each_card_prelinks'
154 | - 'for_each_card_rtds'
155 | - 'for_each_card_rtds_safe'
156 | - 'for_each_card_widgets'
157 | - 'for_each_card_widgets_safe'
158 | - 'for_each_cgroup_storage_type'
159 | - 'for_each_child_of_node'
160 | - 'for_each_clear_bit'
161 | - 'for_each_clear_bit_from'
162 | - 'for_each_cmsghdr'
163 | - 'for_each_compatible_node'
164 | - 'for_each_component_dais'
165 | - 'for_each_component_dais_safe'
166 | - 'for_each_comp_order'
167 | - 'for_each_console'
168 | - 'for_each_cpu'
169 | - 'for_each_cpu_and'
170 | - 'for_each_cpu_not'
171 | - 'for_each_cpu_wrap'
172 | - 'for_each_dapm_widgets'
173 | - 'for_each_dev_addr'
174 | - 'for_each_dev_scope'
175 | - 'for_each_dma_cap_mask'
176 | - 'for_each_dpcm_be'
177 | - 'for_each_dpcm_be_rollback'
178 | - 'for_each_dpcm_be_safe'
179 | - 'for_each_dpcm_fe'
180 | - 'for_each_drhd_unit'
181 | - 'for_each_dss_dev'
182 | - 'for_each_dtpm_table'
183 | - 'for_each_efi_memory_desc'
184 | - 'for_each_efi_memory_desc_in_map'
185 | - 'for_each_element'
186 | - 'for_each_element_extid'
187 | - 'for_each_element_id'
188 | - 'for_each_endpoint_of_node'
189 | - 'for_each_evictable_lru'
190 | - 'for_each_fib6_node_rt_rcu'
191 | - 'for_each_fib6_walker_rt'
192 | - 'for_each_free_mem_pfn_range_in_zone'
193 | - 'for_each_free_mem_pfn_range_in_zone_from'
194 | - 'for_each_free_mem_range'
195 | - 'for_each_free_mem_range_reverse'
196 | - 'for_each_func_rsrc'
197 | - 'for_each_hstate'
198 | - 'for_each_if'
199 | - 'for_each_iommu'
200 | - 'for_each_ip_tunnel_rcu'
201 | - 'for_each_irq_nr'
202 | - 'for_each_link_codecs'
203 | - 'for_each_link_cpus'
204 | - 'for_each_link_platforms'
205 | - 'for_each_lru'
206 | - 'for_each_matching_node'
207 | - 'for_each_matching_node_and_match'
208 | - 'for_each_member'
209 | - 'for_each_memcg_cache_index'
210 | - 'for_each_mem_pfn_range'
211 | - '__for_each_mem_range'
212 | - 'for_each_mem_range'
213 | - '__for_each_mem_range_rev'
214 | - 'for_each_mem_range_rev'
215 | - 'for_each_mem_region'
216 | - 'for_each_migratetype_order'
217 | - 'for_each_msi_entry'
218 | - 'for_each_msi_entry_safe'
219 | - 'for_each_net'
220 | - 'for_each_net_continue_reverse'
221 | - 'for_each_netdev'
222 | - 'for_each_netdev_continue'
223 | - 'for_each_netdev_continue_rcu'
224 | - 'for_each_netdev_continue_reverse'
225 | - 'for_each_netdev_feature'
226 | - 'for_each_netdev_in_bond_rcu'
227 | - 'for_each_netdev_rcu'
228 | - 'for_each_netdev_reverse'
229 | - 'for_each_netdev_safe'
230 | - 'for_each_net_rcu'
231 | - 'for_each_new_connector_in_state'
232 | - 'for_each_new_crtc_in_state'
233 | - 'for_each_new_mst_mgr_in_state'
234 | - 'for_each_new_plane_in_state'
235 | - 'for_each_new_private_obj_in_state'
236 | - 'for_each_node'
237 | - 'for_each_node_by_name'
238 | - 'for_each_node_by_type'
239 | - 'for_each_node_mask'
240 | - 'for_each_node_state'
241 | - 'for_each_node_with_cpus'
242 | - 'for_each_node_with_property'
243 | - 'for_each_nonreserved_multicast_dest_pgid'
244 | - 'for_each_of_allnodes'
245 | - 'for_each_of_allnodes_from'
246 | - 'for_each_of_cpu_node'
247 | - 'for_each_of_pci_range'
248 | - 'for_each_old_connector_in_state'
249 | - 'for_each_old_crtc_in_state'
250 | - 'for_each_old_mst_mgr_in_state'
251 | - 'for_each_oldnew_connector_in_state'
252 | - 'for_each_oldnew_crtc_in_state'
253 | - 'for_each_oldnew_mst_mgr_in_state'
254 | - 'for_each_oldnew_plane_in_state'
255 | - 'for_each_oldnew_plane_in_state_reverse'
256 | - 'for_each_oldnew_private_obj_in_state'
257 | - 'for_each_old_plane_in_state'
258 | - 'for_each_old_private_obj_in_state'
259 | - 'for_each_online_cpu'
260 | - 'for_each_online_node'
261 | - 'for_each_online_pgdat'
262 | - 'for_each_pci_bridge'
263 | - 'for_each_pci_dev'
264 | - 'for_each_pci_msi_entry'
265 | - 'for_each_pcm_streams'
266 | - 'for_each_physmem_range'
267 | - 'for_each_populated_zone'
268 | - 'for_each_possible_cpu'
269 | - 'for_each_present_cpu'
270 | - 'for_each_prime_number'
271 | - 'for_each_prime_number_from'
272 | - 'for_each_process'
273 | - 'for_each_process_thread'
274 | - 'for_each_prop_codec_conf'
275 | - 'for_each_prop_dai_codec'
276 | - 'for_each_prop_dai_cpu'
277 | - 'for_each_prop_dlc_codecs'
278 | - 'for_each_prop_dlc_cpus'
279 | - 'for_each_prop_dlc_platforms'
280 | - 'for_each_property_of_node'
281 | - 'for_each_registered_fb'
282 | - 'for_each_requested_gpio'
283 | - 'for_each_requested_gpio_in_range'
284 | - 'for_each_reserved_mem_range'
285 | - 'for_each_reserved_mem_region'
286 | - 'for_each_rtd_codec_dais'
287 | - 'for_each_rtd_components'
288 | - 'for_each_rtd_cpu_dais'
289 | - 'for_each_rtd_dais'
290 | - 'for_each_set_bit'
291 | - 'for_each_set_bit_from'
292 | - 'for_each_set_clump8'
293 | - 'for_each_sg'
294 | - 'for_each_sg_dma_page'
295 | - 'for_each_sg_page'
296 | - 'for_each_sgtable_dma_page'
297 | - 'for_each_sgtable_dma_sg'
298 | - 'for_each_sgtable_page'
299 | - 'for_each_sgtable_sg'
300 | - 'for_each_sibling_event'
301 | - 'for_each_subelement'
302 | - 'for_each_subelement_extid'
303 | - 'for_each_subelement_id'
304 | - '__for_each_thread'
305 | - 'for_each_thread'
306 | - 'for_each_unicast_dest_pgid'
307 | - 'for_each_vsi'
308 | - 'for_each_wakeup_source'
309 | - 'for_each_zone'
310 | - 'for_each_zone_zonelist'
311 | - 'for_each_zone_zonelist_nodemask'
312 | - 'fwnode_for_each_available_child_node'
313 | - 'fwnode_for_each_child_node'
314 | - 'fwnode_graph_for_each_endpoint'
315 | - 'gadget_for_each_ep'
316 | - 'genradix_for_each'
317 | - 'genradix_for_each_from'
318 | - 'hash_for_each'
319 | - 'hash_for_each_possible'
320 | - 'hash_for_each_possible_rcu'
321 | - 'hash_for_each_possible_rcu_notrace'
322 | - 'hash_for_each_possible_safe'
323 | - 'hash_for_each_rcu'
324 | - 'hash_for_each_safe'
325 | - 'hctx_for_each_ctx'
326 | - 'hlist_bl_for_each_entry'
327 | - 'hlist_bl_for_each_entry_rcu'
328 | - 'hlist_bl_for_each_entry_safe'
329 | - 'hlist_for_each'
330 | - 'hlist_for_each_entry'
331 | - 'hlist_for_each_entry_continue'
332 | - 'hlist_for_each_entry_continue_rcu'
333 | - 'hlist_for_each_entry_continue_rcu_bh'
334 | - 'hlist_for_each_entry_from'
335 | - 'hlist_for_each_entry_from_rcu'
336 | - 'hlist_for_each_entry_rcu'
337 | - 'hlist_for_each_entry_rcu_bh'
338 | - 'hlist_for_each_entry_rcu_notrace'
339 | - 'hlist_for_each_entry_safe'
340 | - 'hlist_for_each_entry_srcu'
341 | - '__hlist_for_each_rcu'
342 | - 'hlist_for_each_safe'
343 | - 'hlist_nulls_for_each_entry'
344 | - 'hlist_nulls_for_each_entry_from'
345 | - 'hlist_nulls_for_each_entry_rcu'
346 | - 'hlist_nulls_for_each_entry_safe'
347 | - 'i3c_bus_for_each_i2cdev'
348 | - 'i3c_bus_for_each_i3cdev'
349 | - 'ide_host_for_each_port'
350 | - 'ide_port_for_each_dev'
351 | - 'ide_port_for_each_present_dev'
352 | - 'idr_for_each_entry'
353 | - 'idr_for_each_entry_continue'
354 | - 'idr_for_each_entry_continue_ul'
355 | - 'idr_for_each_entry_ul'
356 | - 'in_dev_for_each_ifa_rcu'
357 | - 'in_dev_for_each_ifa_rtnl'
358 | - 'inet_bind_bucket_for_each'
359 | - 'inet_lhash2_for_each_icsk_rcu'
360 | - 'key_for_each'
361 | - 'key_for_each_safe'
362 | - 'klp_for_each_func'
363 | - 'klp_for_each_func_safe'
364 | - 'klp_for_each_func_static'
365 | - 'klp_for_each_object'
366 | - 'klp_for_each_object_safe'
367 | - 'klp_for_each_object_static'
368 | - 'kunit_suite_for_each_test_case'
369 | - 'kvm_for_each_memslot'
370 | - 'kvm_for_each_vcpu'
371 | - 'list_for_each'
372 | - 'list_for_each_codec'
373 | - 'list_for_each_codec_safe'
374 | - 'list_for_each_continue'
375 | - 'list_for_each_entry'
376 | - 'list_for_each_entry_continue'
377 | - 'list_for_each_entry_continue_rcu'
378 | - 'list_for_each_entry_continue_reverse'
379 | - 'list_for_each_entry_from'
380 | - 'list_for_each_entry_from_rcu'
381 | - 'list_for_each_entry_from_reverse'
382 | - 'list_for_each_entry_lockless'
383 | - 'list_for_each_entry_rcu'
384 | - 'list_for_each_entry_reverse'
385 | - 'list_for_each_entry_safe'
386 | - 'list_for_each_entry_safe_continue'
387 | - 'list_for_each_entry_safe_from'
388 | - 'list_for_each_entry_safe_reverse'
389 | - 'list_for_each_entry_srcu'
390 | - 'list_for_each_prev'
391 | - 'list_for_each_prev_safe'
392 | - 'list_for_each_safe'
393 | - 'llist_for_each'
394 | - 'llist_for_each_entry'
395 | - 'llist_for_each_entry_safe'
396 | - 'llist_for_each_safe'
397 | - 'mci_for_each_dimm'
398 | - 'media_device_for_each_entity'
399 | - 'media_device_for_each_intf'
400 | - 'media_device_for_each_link'
401 | - 'media_device_for_each_pad'
402 | - 'nanddev_io_for_each_page'
403 | - 'netdev_for_each_lower_dev'
404 | - 'netdev_for_each_lower_private'
405 | - 'netdev_for_each_lower_private_rcu'
406 | - 'netdev_for_each_mc_addr'
407 | - 'netdev_for_each_uc_addr'
408 | - 'netdev_for_each_upper_dev_rcu'
409 | - 'netdev_hw_addr_list_for_each'
410 | - 'nft_rule_for_each_expr'
411 | - 'nla_for_each_attr'
412 | - 'nla_for_each_nested'
413 | - 'nlmsg_for_each_attr'
414 | - 'nlmsg_for_each_msg'
415 | - 'nr_neigh_for_each'
416 | - 'nr_neigh_for_each_safe'
417 | - 'nr_node_for_each'
418 | - 'nr_node_for_each_safe'
419 | - 'of_for_each_phandle'
420 | - 'of_property_for_each_string'
421 | - 'of_property_for_each_u32'
422 | - 'pci_bus_for_each_resource'
423 | - 'pcl_for_each_chunk'
424 | - 'pcl_for_each_segment'
425 | - 'pcm_for_each_format'
426 | - 'ping_portaddr_for_each_entry'
427 | - 'plist_for_each'
428 | - 'plist_for_each_continue'
429 | - 'plist_for_each_entry'
430 | - 'plist_for_each_entry_continue'
431 | - 'plist_for_each_entry_safe'
432 | - 'plist_for_each_safe'
433 | - 'pnp_for_each_card'
434 | - 'pnp_for_each_dev'
435 | - 'protocol_for_each_card'
436 | - 'protocol_for_each_dev'
437 | - 'queue_for_each_hw_ctx'
438 | - 'radix_tree_for_each_slot'
439 | - 'radix_tree_for_each_tagged'
440 | - 'rb_for_each'
441 | - 'rbtree_postorder_for_each_entry_safe'
442 | - 'rdma_for_each_block'
443 | - 'rdma_for_each_port'
444 | - 'rdma_umem_for_each_dma_block'
445 | - 'resource_list_for_each_entry'
446 | - 'resource_list_for_each_entry_safe'
447 | - 'rhl_for_each_entry_rcu'
448 | - 'rhl_for_each_rcu'
449 | - 'rht_for_each'
450 | - 'rht_for_each_entry'
451 | - 'rht_for_each_entry_from'
452 | - 'rht_for_each_entry_rcu'
453 | - 'rht_for_each_entry_rcu_from'
454 | - 'rht_for_each_entry_safe'
455 | - 'rht_for_each_from'
456 | - 'rht_for_each_rcu'
457 | - 'rht_for_each_rcu_from'
458 | - '__rq_for_each_bio'
459 | - 'rq_for_each_bvec'
460 | - 'rq_for_each_segment'
461 | - 'scsi_for_each_prot_sg'
462 | - 'scsi_for_each_sg'
463 | - 'sctp_for_each_hentry'
464 | - 'sctp_skb_for_each'
465 | - 'shdma_for_each_chan'
466 | - '__shost_for_each_device'
467 | - 'shost_for_each_device'
468 | - 'sk_for_each'
469 | - 'sk_for_each_bound'
470 | - 'sk_for_each_entry_offset_rcu'
471 | - 'sk_for_each_from'
472 | - 'sk_for_each_rcu'
473 | - 'sk_for_each_safe'
474 | - 'sk_nulls_for_each'
475 | - 'sk_nulls_for_each_from'
476 | - 'sk_nulls_for_each_rcu'
477 | - 'snd_array_for_each'
478 | - 'snd_pcm_group_for_each_entry'
479 | - 'snd_soc_dapm_widget_for_each_path'
480 | - 'snd_soc_dapm_widget_for_each_path_safe'
481 | - 'snd_soc_dapm_widget_for_each_sink_path'
482 | - 'snd_soc_dapm_widget_for_each_source_path'
483 | - 'tb_property_for_each'
484 | - 'tcf_exts_for_each_action'
485 | - 'udp_portaddr_for_each_entry'
486 | - 'udp_portaddr_for_each_entry_rcu'
487 | - 'usb_hub_for_each_child'
488 | - 'v4l2_device_for_each_subdev'
489 | - 'v4l2_m2m_for_each_dst_buf'
490 | - 'v4l2_m2m_for_each_dst_buf_safe'
491 | - 'v4l2_m2m_for_each_src_buf'
492 | - 'v4l2_m2m_for_each_src_buf_safe'
493 | - 'virtio_device_for_each_vq'
494 | - 'while_for_each_ftrace_op'
495 | - 'xa_for_each'
496 | - 'xa_for_each_marked'
497 | - 'xa_for_each_range'
498 | - 'xa_for_each_start'
499 | - 'xas_for_each'
500 | - 'xas_for_each_conflict'
501 | - 'xas_for_each_marked'
502 | - 'xbc_array_for_each_value'
503 | - 'xbc_for_each_key_value'
504 | - 'xbc_node_for_each_array_value'
505 | - 'xbc_node_for_each_child'
506 | - 'xbc_node_for_each_key_value'
507 | - 'zorro_for_each_dev'
508 |
509 | #IncludeBlocks: Preserve # Unknown to clang-format-5.0
510 | IncludeCategories:
511 | - Regex: '.*'
512 | Priority: 1
513 | IncludeIsMainRegex: '(Test)?$'
514 | IndentCaseLabels: false
515 | #IndentPPDirectives: None # Unknown to clang-format-5.0
516 | IndentWidth: 8
517 | IndentWrappedFunctionNames: false
518 | JavaScriptQuotes: Leave
519 | JavaScriptWrapImports: true
520 | KeepEmptyLinesAtTheStartOfBlocks: false
521 | MacroBlockBegin: ''
522 | MacroBlockEnd: ''
523 | MaxEmptyLinesToKeep: 1
524 | NamespaceIndentation: None
525 | #ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
526 | ObjCBlockIndentWidth: 8
527 | ObjCSpaceAfterProperty: true
528 | ObjCSpaceBeforeProtocolList: true
529 |
530 | # Taken from git's rules
531 | #PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
532 | PenaltyBreakBeforeFirstCallParameter: 30
533 | PenaltyBreakComment: 10
534 | PenaltyBreakFirstLessLess: 0
535 | PenaltyBreakString: 10
536 | PenaltyExcessCharacter: 100
537 | PenaltyReturnTypeOnItsOwnLine: 60
538 |
539 | PointerAlignment: Right
540 | ReflowComments: false
541 | SortIncludes: false
542 | #SortUsingDeclarations: false # Unknown to clang-format-4.0
543 | SpaceAfterCStyleCast: false
544 | SpaceAfterTemplateKeyword: true
545 | SpaceBeforeAssignmentOperators: true
546 | #SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
547 | #SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
548 | SpaceBeforeParens: ControlStatements
549 | #SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
550 | SpaceInEmptyParentheses: false
551 | SpacesBeforeTrailingComments: 1
552 | SpacesInAngles: false
553 | SpacesInContainerLiterals: false
554 | SpacesInCStyleCastParentheses: false
555 | SpacesInParentheses: false
556 | SpacesInSquareBrackets: false
557 | Standard: Cpp03
558 | TabWidth: 8
559 | UseTab: Always
560 | ...
561 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.cmd
2 | *.d
3 | *.ko
4 | *.mod*
5 | *.o
6 | Module.symvers
7 | modules.order
8 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "bpf/tcp-int"]
2 | path = bpf/tcp-int
3 | url = https://github.com/jtdor/p4app-TCP-INT.git
4 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | authors:
3 | - family-names: "Hinz"
4 | given-names: "Jörn-Thorben"
5 | orcid: "https://orcid.org/0009-0005-6588-3873"
6 | title: "PowerTCP for Linux"
7 | url: "https://github.com/inet-tub/powertcp-linux"
8 | message: "If you use this software, please cite it as below."
9 | preferred-citation:
10 | type: conference-paper
11 | authors:
12 | - family-names: "Hinz"
13 | given-names: "Jörn-Thorben"
14 | orcid: "https://orcid.org/0009-0005-6588-3873"
15 | - family-names: "Addanki"
16 | given-names: "Vamsi"
17 | orcid: "https://orcid.org/0000-0002-0577-0413"
18 | - family-names: "Györgyi"
19 | given-names: "Csaba"
20 | orcid: "https://orcid.org/0000-0002-8083-3277"
21 | - family-names: "Jepsen"
22 | given-names: "Theo"
23 | orcid: "https://orcid.org/0000-0002-5845-5089"
24 | - family-names: "Schmid"
25 | given-names: "Stefan"
26 | orcid: "https://orcid.org/0000-0002-7798-1711"
27 | doi: "10.1145/3609021.3609295"
28 | journal: "eBPF '23: Proceedings of the 1st Workshop on eBPF and Kernel Extensions"
29 | publisher:
30 | name: "Association for Computing Machinery"
31 | month: 9
32 | start: 1
33 | end: 7
34 | title: "TCP's Third Eye: Leveraging eBPF for Telemetry-Powered Congestion Control"
35 | year: 2023
36 | references:
37 | - authors:
38 | - family-names: "Addanki"
39 | given-names: "Vamsi"
40 | - family-names: "Michel"
41 | given-names: "Oliver"
42 | - family-names: "Schmid"
43 | given-names: "Stefan"
44 | title: "PowerTCP: Pushing the Performance Limits of Datacenter Networks"
45 | start: 51
46 | end: 70
47 | journal: "19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)"
48 | year: 2022
49 | month: 4
50 | type: conference-paper
51 | - authors:
52 | - family-names: "Jereczek"
53 | given-names: "Grzegorz"
54 | - family-names: "Jepsen"
55 | given-names: "Theo"
56 | - family-names: "Wass"
57 | given-names: "Simon"
58 | - family-names: "Pujari"
59 | given-names: "Bimmy"
60 | - family-names: "Zhen"
61 | given-names: "Jerry"
62 | - family-names: "Lee"
63 | given-names: "Jeongkeun"
64 | title: "TCP-INT: Lightweight Network Telemetry with TCP Transport"
65 | start: 58
66 | end: 60
67 | journal: "Proceedings of the SIGCOMM'22 Poster and Demo Sessions"
68 | year: 2022
69 | month: 10
70 | type: conference-paper
71 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Internet Network Architectures (INET)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ifneq ($(KERNELRELEASE),)
2 |
3 | # Without explicitly specifying the source folder as an include dir,
4 | # define_trace.h fails to find our trace header.
5 | ccflags-y := -I$(src)
6 |
7 | obj-m := tcp_powertcp.o
8 |
9 | else
10 |
11 | KDIR ?= /lib/modules/$(shell uname -r)/build
12 |
13 | .PHONY: modules modules_install clean help
14 | modules modules_install clean help:
15 | $(MAKE) -C $(KDIR) M=$$PWD $@
16 |
17 | dkms_package_version := $(shell awk -F= '$$1 == "PACKAGE_VERSION" { gsub("\"", "", $$2); print $$2 }' dkms.conf)
18 |
19 | .PHONY: dkms_install
20 | dkms_install:
21 | dkms install .
22 |
23 | .PHONY: dkms_uninstall
24 | dkms_uninstall:
25 | dkms remove --all powertcp/$(dkms_package_version)
26 | $(RM) -r /usr/src/powertcp-$(dkms_package_version)
27 |
28 | endif
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PowerTCP for Linux
2 | This repository presents an implementation of the PowerTCP congestion control
3 | for the Linux kernel. The two variants of PowerTCP are provided as separate
4 | congestion control algorithms:
5 | - the telemetry-based *PowerTCP* and
6 | - the simplified, timing-based *RTT-PowerTCP* (called *θ-PowerTCP* in the
7 | [paper](#for-powertcp)).
8 |
9 | Please see the [references](#references) for background on this work.
10 |
11 | This repository contains two implementations of PowerTCP: a kernel module and an
12 | eBPF program.
13 |
14 | ## Step-by-step instructions
15 |
16 | The main focus of this work is on the eBPF implementation. Follow its
17 | [instructions](bpf/README.md) for experimenting with it.
18 |
19 | There is also a proof-of-concept implementation as a kernel module, see its
20 | [instructions](doc/module.md).
21 |
22 | ## Implementation details
23 | There is *some* documentation on aspects of the implementation(s) in
24 | [doc/](doc/).
25 |
26 | ## References
27 |
28 | ### For the work in this repository
29 | > Jörn-Thorben Hinz, Vamsi Addanki, Csaba Györgyi, Theo Jepsen, and Stefan Schmid.
30 | > “TCP's Third Eye: Leveraging eBPF for Telemetry-Powered Congestion Control”
31 | > In *Proceedings of the 1st Workshop on eBPF and Kernel Extensions*, pp. 1-7. 2023.
32 |
33 | https://doi.org/10.1145/3609021.3609295
34 |
35 |
36 | Click for BibTeX citation
37 |
38 | ```bib
39 | @inproceedings{tcpsthirdeye,
40 | author = {Hinz, J\"{o}rn-Thorben and Addanki, Vamsi and Gy\"{o}rgyi, Csaba and Jepsen, Theo and Schmid, Stefan},
41 | title = {TCP's Third Eye: Leveraging EBPF for Telemetry-Powered Congestion Control},
42 | year = {2023},
43 | isbn = {9798400702938},
44 | publisher = {Association for Computing Machinery},
45 | address = {New York, NY, USA},
46 | url = {https://doi.org/10.1145/3609021.3609295},
47 | doi = {10.1145/3609021.3609295},
48 | booktitle = {Proceedings of the 1st Workshop on EBPF and Kernel Extensions},
49 | pages = {1–7},
50 | numpages = {7},
51 | keywords = {eBPF, datacenter, INT, congestion control, TCP, linux kernel},
52 | location = {New York, NY, USA},
53 | series = {eBPF '23}
54 | }
55 | ```
56 |
57 |
58 |
59 | ### For PowerTCP
60 | > Vamsi Addanki, Oliver Michel, and Stefan Schmid.
61 | > “PowerTCP: Pushing the Performance Limits of Datacenter Networks”
62 | > In *19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)*, pp. 51-70. 2022.
63 |
64 | https://www.usenix.org/conference/nsdi22/presentation/addanki
65 |
66 |
67 | Click for BibTeX citation
68 |
69 | ```bib
70 | @inproceedings{powertcp,
71 | author = {Vamsi Addanki and Oliver Michel and Stefan Schmid},
72 | title = {{PowerTCP}: Pushing the Performance Limits of Datacenter Networks},
73 | booktitle = {19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)},
74 | year = {2022},
75 | isbn = {978-1-939133-27-4},
76 | address = {Renton, WA},
77 | pages = {51--70},
78 | url = {https://www.usenix.org/conference/nsdi22/presentation/addanki},
79 | publisher = {USENIX Association},
80 | month = apr
81 | }
82 | ```
83 |
84 |
85 |
86 | ### For TCP-INT
87 | > Grzegorz Jereczek, Theo Jepsen, Simon Wass, Bimmy Pujari, Jerry Zhen, and Jeongkeun Lee.
88 | > “TCP-INT: Lightweight Network Telemetry with TCP Transport”
89 | > In *Proceedings of the SIGCOMM'22 Poster and Demo Sessions*, pp. 58-60. 2022.
90 |
91 | https://doi.org/10.1145/3546037.3546064
92 |
93 |
94 | Click for BibTeX citation
95 |
96 | ```bib
97 | @inproceedings{tcpint,
98 | author = {Jereczek, Grzegorz and Jepsen, Theo and Wass, Simon and Pujari, Bimmy and Zhen, Jerry and Lee, Jeongkeun},
99 | title = {TCP-INT: Lightweight Network Telemetry with TCP Transport},
100 | year = {2022},
101 | isbn = {9781450394345},
102 | publisher = {Association for Computing Machinery},
103 | address = {New York, NY, USA},
104 | url = {https://doi.org/10.1145/3546037.3546064},
105 | doi = {10.1145/3546037.3546064},
106 | pages = {58–60},
107 | numpages = {3},
108 | keywords = {in-band network telemetry, network monitoring},
109 | location = {Amsterdam, Netherlands},
110 | series = {SIGCOMM '22}
111 | }
112 | ```
113 |
114 |
115 |
--------------------------------------------------------------------------------
/bpf/.gitignore:
--------------------------------------------------------------------------------
1 | *.skel.h
2 | powertcp
3 | vmlinux.h
4 |
--------------------------------------------------------------------------------
/bpf/Makefile:
--------------------------------------------------------------------------------
1 | BPFTOOL ?= /usr/sbin/bpftool
2 | CLANG ?= clang
3 | LLVM_STRIP ?= llvm-strip
4 | VMLINUX ?= /sys/kernel/btf/vmlinux
5 | TCP_INT_DIR ?= tcp-int/code
6 |
7 | HAVE_WRITABLE_SK_PACING ?= 0
8 | USE_SWLAT_AS_TIMESTAMP ?= 0
9 |
10 | BPF_OBJS := powertcp.bpf.o
11 | BPF_DEPS := $(BPF_OBJS:.o=.d)
12 | BPF_SKELS := $(BPF_OBJS:.bpf.o=.skel.h)
13 | PROGS := powertcp
14 | PROG_DEPS := $(PROGS:=.d)
15 | PROG_OBJS := $(PROGS:=.o)
16 | VMLINUX_H := vmlinux.h
17 |
18 | # Copied from Linux' tools/scripts/Makefile.arch:
19 | ARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \
20 | -e s/sun4u/sparc/ -e s/sparc64/sparc/ \
21 | -e /arm64/!s/arm.*/arm/ -e s/sa110/arm/ \
22 | -e s/s390x/s390/ -e s/parisc64/parisc/ \
23 | -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
24 | -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \
25 | -e s/riscv.*/riscv/)
26 |
27 | LIBBPF_CFLAGS := $(shell pkg-config --cflags libbpf)
28 | LIBBPF_LIBS := $(shell pkg-config --libs libbpf)
29 |
30 | BPF_CFLAGS := -g -target bpf -D__TARGET_ARCH_$(ARCH) \
31 | $(LIBBPF_CFLAGS) -c -O2 \
32 | -mcpu=v3 -Wall -DHAVE_WRITABLE_SK_PACING=$(HAVE_WRITABLE_SK_PACING) \
33 | -DUSE_SWLAT_AS_TIMESTAMP=$(USE_SWLAT_AS_TIMESTAMP) \
34 | -I.. -I$(TCP_INT_DIR)/include
35 | CXXFLAGS := -std=gnu++17 -O3 -Wall -Wextra $(LIBBPF_CFLAGS) -I.. -I$(TCP_INT_DIR)/src/tools
36 | DEP_CFLAGS := -M -MG -I.. -I$(TCP_INT_DIR)/include
37 | LDLIBS := $(LIBBPF_LIBS)
38 |
39 | .PHONY: all
40 | all: $(BPF_OBJS) $(PROGS) tcp_int
41 |
42 | .PHONY: clean
43 | clean: # also cleans the TCP-INT build tree under $(TCP_INT_DIR)/src
44 | $(MAKE) -C $(TCP_INT_DIR)/src clean
45 | $(RM) $(BPF_DEPS) $(BPF_OBJS) $(BPF_SKELS) $(PROG_DEPS) $(PROG_OBJS) $(PROGS) $(VMLINUX_H)
46 |
47 | $(VMLINUX_H): $(VMLINUX)
48 | $(BPFTOOL) btf dump file $< format c > $@
49 |
50 | %.bpf.o: %.bpf.c
51 | $(CLANG) $(BPF_CFLAGS) $< -o $@
52 | $(LLVM_STRIP) -g $@
53 |
54 | %.skel.h: %.bpf.o
55 | $(BPFTOOL) gen skeleton $< > $@
56 |
57 | $(PROGS): CC=$(CXX)
58 |
59 | $(PROG_DEPS): DEP_CFLAGS += -I$(TCP_INT_DIR)/src/tools
60 |
61 | $(BPF_DEPS): %.d: %.c
62 | $(CC) $(DEP_CFLAGS) $< -MF $@
63 |
64 | $(PROG_DEPS): %.d: %.cpp
65 | $(CXX) $(DEP_CFLAGS) $< -MF $@
66 | # Build TCP-INT in the tree selected by TCP_INT_DIR; VMLINUX_H is removed from the sub-make's environment.
67 | .PHONY: tcp_int
68 | tcp_int:
69 | env --unset=VMLINUX_H $(MAKE) -C $(TCP_INT_DIR)/src
70 |
71 | ifneq ($(MAKECMDGOALS),clean)
72 | -include $(BPF_DEPS) $(PROG_DEPS)
73 | endif
74 |
--------------------------------------------------------------------------------
/bpf/README.md:
--------------------------------------------------------------------------------
1 | # PowerTCP eBPF implementation
2 |
3 | > [!IMPORTANT]
4 | > The `bpf_powertcp` congestion control is fully functional but requires TCP-INT
5 | > to be deployed on your network switches. A description of how to deploy TCP-INT
6 | > is unfortunately outside the scope of this repository. You can find a hint in
7 | > the TCP-INT repository: [Switch Code](https://github.com/p4lang/p4app-TCP-INT/tree/v0.2.0-alpha#switch-code).
8 |
9 | > [!NOTE]
10 | > The `bpf_rttpowertcp` is fully functional when the network interface(s)
11 | > support hardware timestamping. You can check the support
12 | > with (as root/with `sudo`)
13 | > ```
14 | > ethtool -T INTERFACE | grep hardware-receive
15 | > ```
16 | > which should output `hardware-receive`.
17 |
18 | Following are step-by-step instructions on how to use and experiment with the
19 | PowerTCP eBPF implementation. All commands listed here are assumed to be executed
20 | in the root folder of this repository.
21 |
22 | When loaded into the kernel, the congestion control algorithms are called
23 | `bpf_powertcp` and `bpf_rttpowertcp`.
24 |
25 | ## Prerequisites
26 |
27 | ### In the network
28 | - TCP-INT
29 | [deployed](https://github.com/p4lang/p4app-TCP-INT/tree/v0.2.0-alpha#switch-code)
30 | on network switches
31 |
32 | ### On the hosts
33 | - Linux kernel 5.10 or above (ideally 6.0 or above)
34 | - `bpftool` version 5.15 or above
35 | - `clang` version 3.7 or above
36 | - `g++` version 10 or above
37 | - libbpf version 0.5 or above
38 | - `llvm-strip`
39 | - `make`
40 |
41 | The required versions are available starting with Debian 11 (Bullseye) and Ubuntu
42 | 22.04 (Jammy Jellyfish).
43 |
44 | The installation of the required software is shown in the following.
45 |
46 |
47 | Details on the kernel requirements
48 |
49 | The target kernel must be compiled with `CONFIG_DEBUG_INFO_BTF=y`. It usually
50 | is, check with
51 | ```
52 | grep -w CONFIG_DEBUG_INFO_BTF /boot/config-$(uname -r)
53 | ```
54 |
55 | For optimal performance, the target kernel can be
56 | [patched for `sk_pacing_rate` to be writable](https://lore.kernel.org/all/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de/)
57 | from eBPF code. This patch is included in kernel versions 6.0 and following, no
58 | further action is required. If the target kernel is *manually* patched, enable
59 | the usage of the pacing rate in the eBPF programs by appending
60 | `HAVE_WRITABLE_SK_PACING=1` to the below invocation of `make`.
61 |
62 |
63 |
64 | ## After checkout
65 | After checking out this repository, also check out TCP-INT, which is managed as a
66 | Git submodule in the bpf/tcp-int/ subdirectory:
67 | ```
68 | git submodule update --init
69 | ```
70 |
71 | ## Preparation
72 |
73 | The preparation steps need to be executed on both client and server.
74 |
75 | 1. Install required packages (as root/with `sudo`):
76 | ```
77 | apt install 'bpftool|linux-tools-common$' clang g++ gcc libbpf-dev llvm make
78 | ```
79 |
80 | Ideally, tune the network interface *IFACE* for low latency etc. (as root/with
81 | `sudo`):
82 | ```
83 | apt install ethtool procps tuned
84 | ./tools/tune-eth IFACE
85 | ```
86 | 2. Build the PowerTCP BPF program and TCP-INT:
87 | ```
88 | make -C bpf/
89 | ```
90 |
91 | If you are using a modified TCP-INT P4 application that replaces the `swlat`
92 | telemetry field with a timestamp, append `USE_SWLAT_AS_TIMESTAMP=1` to the
93 | above invocation of `make`.
94 |
95 | Disable stripping of the object files (for more human-readable `objdump`
96 | output) by appending `LLVM_STRIP=/bin/true` to the above invocation of `make`.
97 | 3. For `bpf_rttpowertcp`, enable hardware timestamping on the relevant network
98 | interface(s) *IFACE(s)* (as root/with `sudo`):
99 | ```
100 | ./bpf/powertcp enable-hwts IFACE(s)
101 | ```
102 |
103 | ## On the server
104 |
105 | *Close any previously opened screen sessions that were opened this way.*
106 |
107 | Start `iperf` and `iperf3` server instances, ready to use PowerTCP, in a screen
108 | session (as root/with `sudo`):
109 | ```
110 | ./tools/setup-bpf iperf-servers
111 | ```
112 | **Beware: You are root user inside the screen session!**
113 |
114 | Algorithm parameters (see [On the client](#on-the-client)) do not need to be
115 | set on the server, they are irrelevant here.
116 |
117 | ## On the client
118 |
119 | > [!NOTE]
120 | > Applications that want to use `bpf_powertcp` or `bpf_rttpowertcp` must be
121 | > executed in the *tcp-int* cgroup. The `setup-bpf` script takes care of this.
122 |
123 | On the client, you can use PowerTCP in an interactive session or automatically
124 | record traces of the algorithm execution.
125 |
126 | ### Interactive usage
127 |
128 | *Close any previously opened screen sessions that were opened this way.*
129 |
130 | The `setup-bpf` script opens a screen session readily prepared to use PowerTCP.
131 | Applications executed in this screen session are in the *tcp-int* cgroup, as
132 | required.
133 |
134 | You can pass algorithm parameters to `setup-bpf`. You should pass at least
135 | `hop_bw` and `host_bw`, e.g. (as root/with `sudo`):
136 | ```
137 | ./tools/setup-bpf iperf-client tracing host_bw=25000 hop_bw=25000 base_rtt=50
138 | ```
139 | For a list of the available parameters see
140 | ```
141 | ./bpf/powertcp -h
142 | ```
143 |
144 | **Beware: You are root user inside the screen session!**
145 |
146 | Inside the screen session, you can, e.g.,
147 | - run `iperf3` (or `iperf`, the options differ)
148 | ```
149 | iperf3 -N -C bpf_powertcp -c SERVER_IP
150 | iperf3 -N -C bpf_rttpowertcp -c SERVER_IP
151 | ```
152 | - or watch PowerTCP’s trace output
153 | ```
154 | ./bpf/powertcp trace
155 | ```
156 | (for CSV output append the option `-C`—or see [Record traces](#record-traces))
157 | - or watch TCP-INT’s trace output
158 | ```
159 | ./bpf/tcp-int/code/src/tools/tcp_int trace
160 | ```
161 | - or quickly setup PowerTCP with different parameters
162 | ```
163 | ./bpf/powertcp register -f tracing host_bw=100000 hop_bw=100000 base_rtt=50 gamma=0.7
164 | ```
165 |
166 | ### Record traces
167 |
168 | *To record a trace, close any previously opened screen sessions opened for
169 | [interactive usage](#interactive-usage).*
170 |
171 | Record traces (as CSV files) of running `iperf`/`iperf3` with multiple
172 | **combinations** of algorithm parameters (as root/with `sudo`):
173 | ```
174 | ./tools/bpf_tracer iperf3 -N -c SERVER_IP -C bpf_powertcp -- host_bw=25000 hop_bw=20000 base_rtt=50 beta="2 10" gamma="0.5 0.9"
175 | ```
176 |
177 | `bpf_tracer` takes an `iperf`/`iperf3` command line followed by PowerTCP
178 | algorithm parameters, separated by a `--`:
179 | ```
180 | ./tools/bpf_tracer IPERF(3)_CMDLINE -- POWERTCP_PARAMS
181 | ```
182 | `IPERF(3)_CMDLINE` must contain a full `iperf`/`iperf3` *client* command line;
183 | *it must specify the congestion control algorithm to use*.
184 |
185 | `POWERTCP_PARAMS` can contain any of the parameters listed by
186 | `./bpf/powertcp -h`. Multiple values can be given for each parameter as a
187 | quoted string.
188 |
189 | The above example call produces 4 CSV files:
190 | ```
191 | bpf_powertcp-gamma=0.5 base_rtt=50 hop_bw=20000 beta=10 host_bw=25000.csv
192 | bpf_powertcp-gamma=0.5 base_rtt=50 hop_bw=20000 beta=2 host_bw=25000.csv
193 | bpf_powertcp-gamma=0.9 base_rtt=50 hop_bw=20000 beta=10 host_bw=25000.csv
194 | bpf_powertcp-gamma=0.9 base_rtt=50 hop_bw=20000 beta=2 host_bw=25000.csv
195 | ```
196 |
--------------------------------------------------------------------------------
/bpf/bpf_ca_helpers.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 | /*
3 | * Common helpers for an eBPF CA.
4 | *
5 | * Similar to Linux' tools/testing/selftests/bpf/bpf_tcp_helpers.h but without
6 | * type definitions. vmlinux.h is used here for those. Most parts are copied
7 | * from net/tcp.h.
8 | */
9 |
10 | #ifndef BPF_CA_HELPERS_H
11 | #define BPF_CA_HELPERS_H
12 |
13 | #include "vmlinux.h"
14 |
15 | #define MEGA 1000000UL
16 | #define SO_MAX_PACING_RATE 47
17 | #define SO_TIMESTAMPING_NEW 65
18 | #define SOL_SOCKET 1
19 | #define SOL_TCP 6
20 | #define TCP_INFINITE_SSTHRESH 0x7fffffff
21 | #define USEC_PER_SEC 1000000L
22 | #define NSEC_PER_SEC 1000000000L
23 | #define NSEC_PER_USEC 1000L
24 |
25 | #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
26 | #define BITS_PER_BYTE 8
27 | #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
28 | #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))
29 | #if __STDC_VERSION__ <= 201710L
30 | #define BUILD_BUG_ON(cond) _Static_assert(!(cond), "BUILD BUG: " #cond)
31 | #else
32 | #define BUILD_BUG_ON(cond) static_assert(!(cond), "BUILD BUG: " #cond)
33 | #endif
34 | #define ICSK_CA_PRIV_SIZE \
35 | (sizeof(((struct inet_connection_sock *)NULL)->icsk_ca_priv))
36 | #define max(x, y) (((x) > (y)) ? (x) : (y))
37 | #define max_t(type, x, y) max((type)(x), (type)(y))
38 | #define min(x, y) (((x) < (y)) ? (x) : (y))
39 | #define min_t(type, x, y) min((type)(x), (type)(y))
40 | /* True if seq1 comes before seq2; the signed 32-bit difference keeps the comparison valid across wrap-around. */
41 | static inline bool before(u32 seq1, u32 seq2)
42 | {
43 | return (s32)(seq1 - seq2) < 0;
44 | }
45 | #define after(seq2, seq1) before(seq1, seq2)
46 | /* Reinterpret sk as a struct inet_connection_sock (plain pointer cast, no checks). */
47 | static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
48 | {
49 | return (struct inet_connection_sock *)sk;
50 | }
51 | /* Return a pointer to the icsk_ca_priv area of sk's inet_connection_sock. */
52 | static inline void *inet_csk_ca(const struct sock *sk)
53 | {
54 | return (void *)inet_csk(sk)->icsk_ca_priv;
55 | }
56 |
57 | /* Minimum RTT in usec. ~0 means not available. */
58 | static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
59 | {
60 | return tp->rtt_min.s[0].v;
61 | }
62 | /* Reinterpret sk as a struct tcp_sock (plain pointer cast, no checks). */
63 | static inline struct tcp_sock *tcp_sk(const struct sock *sk)
64 | {
65 | return (struct tcp_sock *)sk;
66 | }
67 | /* Return t1 - t0 taken as a signed 64-bit difference and clamped to 0 if negative; the result is narrowed to u32. */
68 | static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
69 | {
70 | return max_t(s64, t1 - t0, 0);
71 | }
72 |
73 | #endif
74 |
--------------------------------------------------------------------------------
/bpf/powertcp.bpf.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 | /*
3 | * PowerTCP congestion control
4 | *
5 | * Based on the algorithm developed in:
6 | * Addanki, V., O. Michel, and S. Schmid.
7 | * "PowerTCP: Pushing the Performance Limits of Datacenter Networks."
8 | * 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22).
9 | * USENIX Association, 2022.
10 | * Available at: https://arxiv.org/pdf/2112.14309.pdf
11 | *
12 | * Implemented by:
13 | * Jörn-Thorben Hinz, TU Berlin, 2022.
14 | */
15 |
16 | #include "bpf_ca_helpers.h"
17 | #include "powertcp_defs.h"
18 |
19 | #include "vmlinux.h"
20 |
21 | #include "../powertcp_trace.h"
22 |
23 | #include
24 | #include
25 |
26 | char _license[] SEC("license") = "Dual MIT/GPL";
27 |
28 | #define ULONG_MAX (-1UL)
29 |
30 | #define POWERTCP_CONG_OPS_ATTRS SEC(".struct_ops")
31 | #define POWERTCP_CONG_OPS_FUNC(name, ...) \
32 | SEC("struct_ops/" __stringify(name)) \
33 | BPF_PROG(name, __VA_ARGS__)
34 | #define POWERTCP_CONG_OPS_FUNC_PTR (void *)
35 | #define POWERTCP_CONG_OPS_NAME_PREFIX bpf_
36 |
37 | /* Configuration variables can only be set before loading the BPF object: */
38 | #define POWERTCP_PARAM_ATTRS const volatile
39 |
40 | #include "powertcp_tcp-int_head.bpf.c"
41 |
42 | #include "../powertcp_head.c"
43 |
44 | POWERTCP_PARAM_ATTRS bool tracing = false;
45 |
46 | extern __u32 LINUX_KERNEL_VERSION __kconfig;
47 |
48 | struct {
49 | __uint(type, BPF_MAP_TYPE_SK_STORAGE);
50 | __uint(map_flags, BPF_F_NO_PREALLOC);
51 | __type(key, int);
52 | __type(value, u64);
53 | } map_powertcp_hwtstamps SEC(".maps");
54 |
55 | struct {
56 | __uint(type, BPF_MAP_TYPE_RINGBUF);
57 | __uint(max_entries, 512 * 1024);
58 | } trace_events SEC(".maps");
59 |
60 | /* Return the host bandwidth (in Mbit/s) from the host_bw parameter; the link-speed lookup below is disabled by #if 0. */
61 | static unsigned long get_host_bw(struct sock *sk)
62 | {
63 | return host_bw;
64 | #if 0
65 | const struct dst_entry *dst = sk->sk_dst_cache;
66 | unsigned long bw = fallback_host_bw;
67 |
68 | if (dst && dst->dev) {
69 | struct ethtool_link_ksettings cmd;
70 | int r;
71 |
72 | rtnl_lock();
73 | /* ethtool_params_from_link_mode() would be even simpler.
74 | * But dst->dev->link_mode seems to always be 0 at this point. */
75 | r = __ethtool_get_link_ksettings(dst->dev, &cmd);
76 | rtnl_unlock();
77 | if (r == 0 && cmd.base.speed != SPEED_UNKNOWN) {
78 | bw = cmd.base.speed;
79 | pr_debug("hash=%u: got link speed: %lu Mbit/s\n",
80 | sk->sk_hash, bw);
81 | } else {
82 | pr_warn("link speed unavailable, using fallback: %lu Mbit/s\n",
83 | bw);
84 | }
85 | }
86 |
87 | return bw;
88 | #endif
89 | }
90 | /* Return the non-zero timestamp stored for sk in map_powertcp_hwtstamps, or tcp_clock_cache if there is none. */
91 | static u64 get_tstamp(const struct sock *sk)
92 | {
93 | u64 *hwtstamp = bpf_sk_storage_get(&map_powertcp_hwtstamps,
94 | (struct sock *)sk, NULL, 0);
95 | if (hwtstamp && *hwtstamp) {
96 | return *hwtstamp;
97 | }
98 |
99 | return tcp_sk(sk)->tcp_clock_cache;
100 | }
101 | /* Stamp the event with bpf_ktime_get_ns() and copy it into the trace_events ring buffer. */
102 | static void output_trace_event(struct powertcp_trace_event *trace_event)
103 | {
104 | trace_event->time = bpf_ktime_get_ns();
105 | bpf_ringbuf_output(&trace_events, trace_event, sizeof(*trace_event), 0);
106 | }
107 | /* Hook for requesting hardware timestamps on sk; intentionally empty in this BPF implementation. */
108 | void require_hwtstamps(struct sock *sk)
109 | {
110 | /* Nothing to do here. For a BPF program to have __sk_buff.hwtstamp
111 | * populated, only ioctl(SIOCSHWTSTAMP) must be executed on the network
112 | * device. No bpf_setsockopt(SO_TIMESTAMPING_*) is necessary.
113 | */
114 | }
115 | /* Switch sk_pacing_status from SK_PACING_NONE to SK_PACING_NEEDED when sk_pacing_* is writable from BPF. */
116 | static void require_pacing(struct sock *sk)
117 | {
118 | /* When using a kernel version before 6.0 that is manually patched with
119 | * https://lore.kernel.org/all/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de/,
120 | * writing to sk_pacing_* can be enabled with HAVE_WRITABLE_SK_PACING=1
121 | * passed to make.
122 | */
123 | if (HAVE_WRITABLE_SK_PACING ||
124 | LINUX_KERNEL_VERSION >= KERNEL_VERSION(6, 0, 0)) {
125 | /* We do want sk_pacing_rate to be respected: */
126 | #if __clang_major__ >= 12
127 | // cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
128 | __sync_bool_compare_and_swap(&sk->sk_pacing_status,
129 | SK_PACING_NONE, SK_PACING_NEEDED);
130 | #else
131 | if (sk->sk_pacing_status == SK_PACING_NONE) {
132 | sk->sk_pacing_status = SK_PACING_NEEDED;
133 | }
134 | #endif
135 | }
136 | }
137 |
138 | /* Set the socket pacing rate (bytes per second). */
139 | static void set_rate(struct sock *sk, unsigned long rate)
140 | {
141 | /* When using a kernel version before 6.0 that is manually patched with
142 | * https://lore.kernel.org/all/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de/,
143 | * writing to sk_pacing_* can be enabled with HAVE_WRITABLE_SK_PACING=1
144 | * passed to make.
145 | *
146 | * With an older and unpatched kernel, it is impossible to control
147 | * sk_pacing_rate here from BPF code.
148 | */
149 | if (HAVE_WRITABLE_SK_PACING ||
150 | LINUX_KERNEL_VERSION >= KERNEL_VERSION(6, 0, 0)) {
151 | sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
152 | }
153 | }
154 | /* Tell whether tracing was enabled through the `tracing` configuration variable. */
155 | static bool tracing_enabled(void)
156 | {
157 | return tracing;
158 | }
159 |
160 | void POWERTCP_CONG_OPS_FUNC(powertcp_cong_avoid, struct sock *sk, u32 ack,
161 | u32 acked)
162 | {
163 | /* Before, tcp_congestion_ops.cong_avoid was non-optional in
164 | * net/ipv4/bpf_tcp_ca.c, even if it is never used when cong_control is
165 | * also set. This was fixed in Linux 6.0 with
166 | * https://lore.kernel.org/all/20220622191227.898118-3-jthinz@mailbox.tu-berlin.de/.
167 | *
168 | * This stub is kept here for compatibility with older kernels.
169 | */
170 | }
171 | /* cgroup ingress program: store skb->hwtstamp (or skb->tstamp if that is 0) in the socket's storage; always returns 1. */
172 | SEC("cgroup_skb/ingress")
173 | int powertcp_hwtstamp(struct __sk_buff *skb)
174 | {
175 | struct bpf_sock *sk = skb->sk;
176 | if (sk) {
177 | u64 *hwtstamp =
178 | bpf_sk_storage_get(&map_powertcp_hwtstamps, sk, NULL,
179 | BPF_SK_STORAGE_GET_F_CREATE);
180 | if (hwtstamp) {
181 | __u64 hwts = skb->hwtstamp;
182 | __u64 ts = skb->tstamp;
183 | *hwtstamp = hwts > 0 ? hwts : ts;
184 | }
185 | }
186 |
187 | return 1;
188 | }
189 |
190 | #include "powertcp_tcp-int.bpf.c"
191 |
192 | #include "../powertcp.c"
193 |
--------------------------------------------------------------------------------
/bpf/powertcp.cpp:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 | /*
3 | * Loader and configuration tool for the eBPF implementation of the PowerTCP
4 | * congestion control algorithm.
5 | *
6 | * Author:
7 | * Jörn-Thorben Hinz, TU Berlin, 2022.
8 | */
9 | #include "powertcp.skel.h"
10 | #include "powertcp_defs.h"
11 |
12 | #include "tcp_int.h"
13 |
14 | #include
15 | #include
16 | #if !defined(LIBBPF_MAJOR_VERSION) || LIBBPF_MAJOR_VERSION < 1
17 | #include
18 | #endif
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 | #include
31 | #include
32 | #include
33 | #include
34 | #include
35 | #include
36 | #include
37 | #include
38 | #include
39 | #include
40 | #include
41 | #include
42 | #include
43 | #include
44 | #include
45 |
46 | #include "powertcp_trace.h"
47 |
48 | namespace
49 | {
50 | template
51 | struct delete_func_wrapper {
52 | void operator()(T *ptr) const noexcept
53 | {
54 | DeleteFunc(ptr);
55 | }
56 | };
57 |
58 | using arg_vector = std::vector;
59 |
60 | template
61 | using ptr_with_delete_func =
62 | std::unique_ptr >;
63 |
64 | using powertcp_bpf_ptr =
65 | ptr_with_delete_func;
66 |
67 | struct powertcp_param_bool {
68 | using rodata_type = bool;
69 | std::size_t rodata_off;
70 | };
71 |
72 | struct powertcp_param_double {
73 | using rodata_type = long;
74 | std::size_t rodata_off;
75 | double scale;
76 | };
77 |
78 | struct powertcp_param_long {
79 | using rodata_type = long;
80 | std::size_t rodata_off;
81 | };
82 |
83 | using powertcp_param = std::variant;
85 |
86 | struct powertcp_param_visitor {
87 | const std::string &str;
88 | powertcp_bpf::powertcp_bpf__rodata *rodata;
89 |
90 | void operator()(const powertcp_param_bool &par) const
91 | {
92 | assign_param(true, par, rodata);
93 | }
94 |
95 | void operator()(const powertcp_param_double &par) const
96 | {
97 | assign_param(std::stod(str) * par.scale, par, rodata);
98 | }
99 |
100 | void operator()(const powertcp_param_long &par) const
101 | {
102 | assign_param(std::stol(str), par, rodata);
103 | }
104 |
105 | template
106 | void assign_param(T val, P param,
107 | powertcp_bpf::powertcp_bpf__rodata *rodata) const
108 | {
109 | assert(rodata != nullptr);
110 |
111 | auto &rodata_param =
112 | *reinterpret_cast(
113 | reinterpret_cast(rodata) +
114 | param.rodata_off);
115 | /* TODO: Maybe check if a value is in the allowed range. Or do that in
116 | * the BPF code. */
117 | rodata_param = val;
118 | }
119 | };
120 |
121 | using ring_buffer_ptr = ptr_with_delete_func;
122 | /* Move-only owner of a file descriptor that closes it on destruction; a negative value means no descriptor is held. */
123 | class unique_fd {
124 | public:
125 | unique_fd() noexcept : fd_{ -1 }
126 | {
127 | }
128 |
129 | explicit unique_fd(int fd) noexcept : fd_{ fd }
130 | {
131 | }
132 |
133 | unique_fd(const unique_fd &) = delete;
134 | unique_fd &operator=(const unique_fd &) = delete;
135 |
136 | unique_fd(unique_fd &&other) noexcept
137 | : fd_{ std::exchange(other.fd_, -1) }
138 | {
139 | }
140 |
141 | unique_fd &operator=(unique_fd &&other) noexcept
142 | {
143 | close();
144 | std::swap(fd_, other.fd_);
145 | return *this;
146 | }
147 |
148 | ~unique_fd()
149 | {
150 | close();
151 | }
152 |
153 | explicit operator bool() const noexcept
154 | {
155 | return fd_ > -1;
156 | }
157 |
158 | void close() noexcept
159 | {
160 | if (fd_ > -1) {
161 | ::close(fd_); /* Ignoring any errors here. */
162 | fd_ = -1;
163 | }
164 | }
165 |
166 | int get() const noexcept
167 | {
168 | return fd_;
169 | }
170 |
171 | private:
172 | int fd_;
173 | };
174 |
175 | using bpf_link_ptr =
176 | std::unique_ptr >;
178 |
179 | #define POWERTCP_RODATA_OFFSET(member) \
180 | offsetof(powertcp_bpf::powertcp_bpf__rodata, member)
181 | const std::unordered_map params = {
182 | { "base_rtt", powertcp_param_long{ POWERTCP_RODATA_OFFSET(base_rtt) } },
183 | { "beta", powertcp_param_long{ POWERTCP_RODATA_OFFSET(beta) } },
184 | { "expected_flows",
185 | powertcp_param_long{ POWERTCP_RODATA_OFFSET(expected_flows) } },
186 | { "gamma",
187 | powertcp_param_double{ POWERTCP_RODATA_OFFSET(gamma), gamma_scale } },
188 | { "hop_bw", powertcp_param_long{ POWERTCP_RODATA_OFFSET(hop_bw) } },
189 | { "host_bw", powertcp_param_long{ POWERTCP_RODATA_OFFSET(host_bw) } },
190 | { "tracing", powertcp_param_bool{ POWERTCP_RODATA_OFFSET(tracing) } },
191 | };
192 | #undef POWERTCP_RODATA_OFFSET
193 |
194 | const std::filesystem::path powertcp_pin_dir = "/sys/fs/bpf/powertcp";
195 |
196 | volatile std::sig_atomic_t running = true;
197 | /* Parse one NAME[=VALUE] argument and store the value in rodata; throws on an unknown name or an invalid/out-of-range number. */
198 | void parse_param(std::string_view param_arg,
199 | powertcp_bpf::powertcp_bpf__rodata *rodata)
200 | {
201 | std::istringstream iss(std::string{ param_arg });
202 |
203 | std::string name_tok;
204 | std::getline(iss, name_tok, '=');
205 |
206 | const auto param_iter = params.find(name_tok);
207 | if (param_iter == std::end(params)) {
208 | std::ostringstream oss;
209 | oss << "Unknown algorithm parameter '" << name_tok << "'";
210 | throw std::invalid_argument(oss.str());
211 | }
212 |
213 | std::string value_tok;
214 | std::getline(iss, value_tok, '=');
215 |
216 | try {
217 | std::visit(powertcp_param_visitor{ value_tok, rodata },
218 | param_iter->second);
219 | } catch (const std::invalid_argument &) {
220 | std::ostringstream oss;
221 | oss << "Invalid value '" << value_tok << "' for parameter "
222 | << name_tok << ": invalid number";
223 | throw std::invalid_argument(oss.str());
224 | } catch (const std::out_of_range &) {
225 | std::ostringstream oss;
226 | oss << "Invalid value '" << value_tok << "' for parameter "
227 | << name_tok << ": out of range";
228 | throw std::out_of_range(oss.str());
229 | }
230 | }
231 | /* Pin map under its own name below powertcp_pin_dir; an already existing pin is reported on stderr and skipped. */
232 | void pin_map(bpf_map *map)
233 | {
234 | assert(map != nullptr);
235 |
236 | const char *map_name = bpf_map__name(map);
237 | const auto pin_path = powertcp_pin_dir / map_name;
238 | if (bpf_map__pin(map, pin_path.c_str())) {
239 | if (errno == EEXIST) {
240 | fprintf(stderr, "%s is already pinned, skipping\n",
241 | map_name);
242 | return;
243 | }
244 |
245 | std::ostringstream oss;
246 | oss << "bpf_map__pin(" << map_name << ")";
247 | throw std::system_error(errno, std::generic_category(),
248 | oss.str());
249 | }
250 | }
251 | /* Attach prog to the cgroup at cgroup_path (creating the directory if needed) and pin the link as link_<prog name> in powertcp_pin_dir. */
252 | void attach_and_pin_cgroup_prog(bpf_program *prog,
253 | std::filesystem::path cgroup_path)
254 | {
255 | const char *prog_name = bpf_program__name(prog);
256 |
257 | std::filesystem::create_directory(cgroup_path);
258 |
259 | const auto cgroup_fd = unique_fd{ open(cgroup_path.c_str(), O_RDONLY) };
260 | if (!cgroup_fd) {
261 | throw std::system_error(errno, std::generic_category(),
262 | "open(cgroup_path)");
263 | }
264 |
265 | const auto link = bpf_link_ptr{ bpf_program__attach_cgroup(
266 | prog, cgroup_fd.get()) };
267 | if (!link) {
268 | std::ostringstream oss;
269 | oss << "bpf_program__attach_cgroup(" << prog_name << ")";
270 | throw std::system_error(errno, std::generic_category(),
271 | oss.str());
272 | }
273 |
274 | const auto pin_path =
275 | powertcp_pin_dir /
276 | std::filesystem::path{ "link_" }.concat(prog_name);
277 | if (bpf_link__pin(link.get(), pin_path.c_str())) {
278 | if (errno == EEXIST) {
279 | fprintf(stderr, "%s is already pinned, skipping\n",
280 | prog_name);
281 | return;
282 | }
283 |
284 | std::ostringstream oss;
285 | oss << "bpf_link__pin(" << prog_name << ")";
286 | throw std::system_error(errno, std::generic_category(),
287 | oss.str());
288 | }
289 | }
290 | /* Attach the given struct_ops map; an already registered one (EEXIST) is reported on stderr and skipped. */
291 | void attach_struct_ops(bpf_map *struct_ops)
292 | {
293 | auto link = bpf_link_ptr{ bpf_map__attach_struct_ops(struct_ops) };
294 | if (!link) {
295 | if (errno == EEXIST) {
296 | fprintf(stderr, "%s is already registered, skipping\n",
297 | bpf_map__name(struct_ops));
298 | return;
299 | }
300 |
301 | std::ostringstream oss;
302 | oss << "attach_struct_ops(" << bpf_map__name(struct_ops) << ")";
303 | throw std::system_error(errno, std::generic_category(),
304 | oss.str());
305 | }
306 |
307 | /* Have to __disconnect() before __destroy() so the attached struct_ops
308 | * outlive this userspace program.
309 | */
310 | bpf_link__disconnect(link.get());
311 | }
312 | /* Walk all loaded BPF maps, find the struct_ops map named map_name and delete its element at key 0; returns silently if none is found. */
313 | void delete_struct_ops(std::string_view map_name)
314 | {
315 | unique_fd fd;
316 | __u32 id = 0;
317 |
318 | auto info = bpf_map_info{};
319 | __u32 info_len = sizeof(info);
320 |
321 | while (true) {
322 | if (bpf_map_get_next_id(id, &id)) {
323 | if (errno != ENOENT) {
324 | throw std::system_error(errno,
325 | std::generic_category(),
326 | "map_get_next_id");
327 | }
328 | return;
329 | }
330 |
331 | fd = unique_fd(bpf_map_get_fd_by_id(id));
332 | if (!fd) {
333 | if (errno == ENOENT) {
334 | continue;
335 | }
336 | throw std::system_error(errno, std::generic_category(),
337 | "map_get_fd_by_id");
338 | }
339 |
340 | if (bpf_obj_get_info_by_fd(fd.get(), &info, &info_len)) {
341 | throw std::system_error(errno, std::generic_category(),
342 | "obj_get_info_by_fd");
343 | }
344 |
345 | if (info.type == BPF_MAP_TYPE_STRUCT_OPS &&
346 | map_name == info.name) {
347 | break;
348 | }
349 | }
350 |
351 | constexpr auto zero = 0;
352 | if (bpf_map_delete_elem(fd.get(), &zero)) {
353 | throw std::system_error(errno, std::generic_category(),
354 | "map_delete_elem");
355 | }
356 | }
357 | /* Request hardware RX timestamping (HWTSTAMP_FILTER_ALL) on device dev via ioctl(SIOCSHWTSTAMP); throws if the name is too long or the ioctl fails. */
358 | void enable_hwts(std::string_view dev)
359 | {
360 | auto fd = unique_fd{ socket(AF_UNIX, SOCK_DGRAM, 0) };
361 | if (!fd) {
362 | throw std::system_error{
363 | std::make_error_code(std::errc{ errno }), "socket"
364 | };
365 | }
366 |
367 | hwtstamp_config hwts_conf = {};
368 | hwts_conf.rx_filter = HWTSTAMP_FILTER_ALL;
369 |
370 | ifreq ifr = {};
371 | ifr.ifr_data = reinterpret_cast<__caddr_t>(&hwts_conf);
372 |
373 | dev.copy(ifr.ifr_name, sizeof(ifr.ifr_name) - 1);
374 | assert(ifr.ifr_name[sizeof(ifr.ifr_name) - 1] == '\0');
375 | if (std::size(dev) != std::strlen(ifr.ifr_name)) {
376 | std::ostringstream oss;
377 | oss << "device name too long: " << dev;
378 | throw std::invalid_argument{ oss.str() };
379 | }
380 |
381 | if (ioctl(fd.get(), SIOCSHWTSTAMP, &ifr)) {
382 | const auto err = std::make_error_code(std::errc{ errno });
383 | std::ostringstream oss;
384 |
385 | if (err == std::errc::not_supported) {
386 | oss << dev << " does not support hardware timestamping";
387 | throw std::runtime_error{ oss.str() };
388 | } else {
389 | oss << dev << ": ioctl(SIOCSHWTSTAMP)";
390 | throw std::system_error{ err, oss.str() };
391 | }
392 | }
393 | }
394 | /* enable-hwts command: call enable_hwts() for every device name in args. */
395 | void do_enable_hwts(const arg_vector &args)
396 | {
397 | for (auto &&arg : args) {
398 | enable_hwts(arg);
399 | }
400 | }
401 | /* register command: open the skeleton, apply parameters, reuse the pinned TCP-INT state map, load, attach, and pin the remaining maps. */
402 | void do_register(const arg_vector &args)
403 | {
404 | auto skel = powertcp_bpf_ptr{ powertcp_bpf__open() };
405 | if (!skel) {
406 | throw std::system_error(errno, std::generic_category(), "open");
407 | }
408 |
409 | for (auto &&arg : args) {
410 | parse_param(arg, skel->rodata);
411 | }
412 |
413 | auto map_fd = unique_fd(
414 | bpf_obj_get(TCP_INT_BPF_PIN_PATH "/map_tcp_int_state"));
415 | if (!map_fd) {
416 | throw std::system_error(errno, std::generic_category(),
417 | "obj_get(map_tcp_int_state)");
418 | }
419 |
420 | auto *map_tcp_int_state =
421 | bpf_object__find_map_by_name(skel->obj, "map_tcp_int_state");
422 | if (!map_tcp_int_state) {
423 | throw std::system_error(errno, std::generic_category(),
424 | "find_map_by_name(map_tcp_int_state)");
425 | }
426 |
427 | if (bpf_map__reuse_fd(map_tcp_int_state, map_fd.get()) < 0) {
428 | throw std::system_error(errno, std::generic_category(),
429 | "reuse_fd(map_tcp_int_state)");
430 | }
431 |
432 | if (powertcp_bpf__load(skel.get())) {
433 | throw std::system_error(errno, std::generic_category(), "load");
434 | }
435 |
436 | attach_struct_ops(skel->maps.powertcp);
437 | attach_struct_ops(skel->maps.rttpowertcp);
438 |
439 | attach_and_pin_cgroup_prog(skel->progs.powertcp_hwtstamp,
440 | TCP_INT_CGROUP_PATH);
441 |
442 | /* struct_ops program maps are "pinned"/kept alive in their own way (see
443 | * the comment in attach_struct_ops()), we only want to pin other maps
444 | * here:
445 | */
446 | pin_map(skel->maps.map_powertcp_hwtstamps);
447 | pin_map(skel->maps.trace_events);
448 | }
449 |
450 | int handle_trace_event(void * /* ctx */, void *data, std::size_t /* data_sz */)
451 | {
452 | /* TODO: If it seems appropriate later, merge handle_trace_event() and
453 | * handle_trace_event_csv() and just use two different format strings.
454 | */
455 | const powertcp_trace_event &ev =
456 | *static_cast(data);
457 |
458 | /*
459 | * Desired alignment in the output, showing the maximum value per data type:
460 | *
461 | * # Time (us) Socket hash CWND (segments) Pacing rate (Mbit/s) Norm. power Smoothed power Queue length (bytes) Delta t (ns) Tx. bytes diff RTT grad.
462 | * 18446744073709551615 4294967295 4294967295 xxxxxxxxxx x.yyyyyyyy x.yyyyyyyy 4294967295 4294967295 4294967295 x.yyyyyyyy
463 | */
464 | std::printf(
465 | "%20llu %10u %10u %10lu %10.8f %10.8f %10ld %10u %10u %10.8f\n",
466 | ev.time, ev.sock_hash, ev.cwnd, ev.rate * 8 / 1000000,
467 | static_cast(ev.p_norm) / power_scale,
468 | static_cast(ev.p_smooth) / power_scale, ev.qlen,
469 | ev.delta_t, ev.tx_bytes_diff,
470 | static_cast(ev.rtt_grad) / power_scale);
471 |
472 | return 0;
473 | }
474 |
475 | int handle_trace_event_csv(void * /* ctx */, void *data,
476 | std::size_t /* data_sz */)
477 | {
478 | /* TODO: If it seems appropriate later, merge handle_trace_event() and
479 | * handle_trace_event_csv() and just use two different format strings.
480 | */
481 | const auto &ev = *static_cast(data);
482 |
483 | std::printf("%llu,%u,%u,%lu,%0f,%0f,%ld,%u,%u,%0f\n", ev.time,
484 | ev.sock_hash, ev.cwnd, ev.rate,
485 | static_cast(ev.p_norm) / power_scale,
486 | static_cast(ev.p_smooth) / power_scale, ev.qlen,
487 | ev.delta_t, ev.tx_bytes_diff,
488 | static_cast(ev.rtt_grad) / power_scale);
489 |
490 | return 0;
491 | }
492 | /* trace command: poll the pinned trace_events ring buffer and print each event (table or CSV) until `running` is cleared. */
493 | void do_trace(bool output_csv)
494 | {
495 | auto map_fd = unique_fd{ bpf_obj_get(
496 | (powertcp_pin_dir / "trace_events").c_str()) };
497 | if (!map_fd) { /* Report errno: the returned value is -errno only in libbpf's direct-error mode, otherwise just -1. */
498 | throw std::system_error(errno, std::generic_category(),
499 | "bpf_obj_get");
500 | }
501 |
502 | auto handle_func =
503 | output_csv ? handle_trace_event_csv : handle_trace_event;
504 | auto ring_buf = ring_buffer_ptr{ ring_buffer__new(
505 | map_fd.get(), handle_func, nullptr, nullptr) };
506 | if (!ring_buf) {
507 | throw std::system_error(errno, std::generic_category(),
508 | "ring_buffer__new");
509 | }
510 |
511 | const char *output_header;
512 | if (output_csv) {
513 | output_header =
514 | "time,hash,cwnd,rate,p_norm,p_smooth,qlen,delta_t,tx_bytes_diff,rtt_grad";
515 | } else {
516 | output_header =
517 | "# Time (us) Socket hash CWND (segments) Pacing rate (Mbit/s) Norm. power Smoothed power Queue length (bytes) Delta t (ns) Tx. bytes diff RTT grad.";
518 | }
519 |
520 | auto repeated_timeout = true;
521 | std::puts(output_header);
522 | while (running) {
523 | if (auto err = ring_buffer__poll(ring_buf.get(), 100);
524 | err < 0 && err != -EINTR) {
525 | throw std::system_error(-err, std::generic_category(),
526 | "ring_buffer__poll");
527 | } else if (err == 0 && !repeated_timeout) {
528 | /* err == 0 is a timeout */
529 | if (!output_csv) {
530 | std::puts(output_header);
531 | }
532 | ::fflush(stdout);
533 | repeated_timeout = true;
534 | } else if (err > 0) {
535 | repeated_timeout = false;
536 | }
537 | }
538 | }
539 |
540 | /* Remove both PowerTCP struct_ops ("powertcp" first, then "rttpowertcp") and
541 |  * afterwards delete the pin directory with everything below it.
542 |  */
543 | void do_unregister()
544 | {
545 | 	static constexpr const char *struct_ops_names[] = { "powertcp",
546 | 							     "rttpowertcp" };
547 |
548 | 	for (const auto *name : struct_ops_names) {
549 | 		delete_struct_ops(name);
550 | 	}
551 | 	std::filesystem::remove_all(powertcp_pin_dir);
552 | }
546 | /* Signal handler installed by main() for SIGINT: clears the running flag
    |  * that the polling loop in do_trace() checks. The signal number is unused.
    |  */
547 | void handle_signal(int /* sig */)
548 | {
549 | running = false;
550 | }
551 |
552 | /* Print the command line help to outfile.
553 |  *
554 |  * prog is the program name shown in the synopsis and example lines. The format
555 |  * string refers to it with the positional conversion %1$s, a POSIX extension
556 |  * to printf, so it can be repeated without passing it several times.
557 |  */
558 | void usage(const char *prog, FILE *outfile)
559 | {
560 | 	fprintf(outfile,
561 | 		"Usage: %1$s enable-hwts [DEVICE...]\n"
562 | 		" %1$s [OPTION...] register [PARAMETER...]\n"
563 | 		" %1$s [OPTION...] trace | unregister\n"
564 | 		"\n"
565 | 		"COMMANDS\n"
566 | 		" enable-hwts\n"
567 | 		" Enable hardware timestamping on the given network device(s).\n"
568 | 		"\n"
569 | 		" register\n"
570 | 		" Register the PowerTCP eBPF programs, optionally setting algorithm\n"
571 | 		" parameters.\n"
572 | 		"\n"
573 | 		" trace\n"
574 | 		" Trace the execution of the algorithm.\n"
575 | 		"\n"
576 | 		" unregister\n"
577 | 		" Unregister the PowerTCP eBPF programs.\n"
578 | 		"\n"
579 | 		"OPTIONS\n"
580 | 		" -C\n"
581 | 		" Output traced values in CSV format.\n"
582 | 		"\n"
583 | 		" -f\n"
584 | 		" Force an unregister before a register so parameters can be set to\n"
585 | 		" new values.\n"
586 | 		"\n"
587 | 		" -h\n"
588 | 		" Show this help text and exit.\n"
589 | 		"\n"
590 | 		"PARAMETERS\n"
591 | 		" The following parameters of the PowerTCP algorithm can be set with the\n"
592 | 		" register command:\n"
593 | 		" - base_rtt in µs\n"
594 | 		" - beta in number of packets\n"
595 | 		" - expected_flows in number of flows\n"
596 | 		" - gamma in range 0.0 to 1.0\n"
597 | 		" - hop_bw in Mbit/s\n"
598 | 		" - host_bw in Mbit/s\n"
599 | 		"\n"
600 | 		" Passing the additional, value-less parameter \"tracing\" enables tracing\n"
601 | 		" the algorithm with the trace command.\n"
602 | 		"\n"
603 | 		"EXAMPLE\n"
604 | 		"\n"
605 | 		" # %1$s register expected_flows=1\n"
606 | 		" # %1$s enable-hwts eno1 eno2 eno3\n"
607 | 		"\n",
608 | 		prog);
609 | }
601 | } // namespace
602 |
603 | /* Entry point: parse the options, install the SIGINT handler and dispatch to
604 |  * the command named by the first non-option argument.
605 |  *
606 |  * Returns EXIT_SUCCESS if the command ran without throwing (or -h was given),
607 |  * EXIT_FAILURE on a usage error or when the command failed.
608 |  */
609 | int main(int argc, char *argv[])
610 | {
611 | 	bool force = false;
612 | 	auto output_csv = false;
613 |
614 | 	int opt;
615 | 	while (-1 != (opt = getopt(argc, argv, "Cfh"))) {
616 | 		switch (opt) {
617 | 		case 'C':
618 | 			output_csv = true;
619 | 			break;
620 | 		case 'f':
621 | 			force = true;
622 | 			break;
623 | 		case 'h':
624 | 			usage(argv[0], stdout);
625 | 			return EXIT_SUCCESS;
626 | 		default:
627 | 			usage(argv[0], stderr);
628 | 			return EXIT_FAILURE;
629 | 		}
630 | 	}
631 |
632 | 	/* A command is mandatory. */
633 | 	if (optind >= argc) {
634 | 		usage(argv[0], stderr);
635 | 		return EXIT_FAILURE;
636 | 	}
637 |
638 | 	/* SA_RESETHAND: a second SIGINT gets the default action again. */
639 | 	struct sigaction sigact = {};
640 | 	sigact.sa_handler = handle_signal;
641 | 	sigact.sa_flags = SA_RESETHAND;
642 | 	if (sigaction(SIGINT, &sigact, nullptr)) {
643 | 		std::perror("sigaction");
644 | 		return EXIT_FAILURE;
645 | 	}
646 |
647 | #if !defined(LIBBPF_MAJOR_VERSION) || LIBBPF_MAJOR_VERSION < 1
648 | 	if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL)) {
649 | 		std::perror("libbpf_set_strict_mode");
650 | 		return EXIT_FAILURE;
651 | 	}
652 | #endif
653 |
654 | 	const auto cmd = std::string_view{ argv[optind] };
655 | 	const auto args = arg_vector(argv + optind + 1, argv + argc);
656 |
657 | 	/* Run one command step; report an exception on stderr and return
658 | 	 * whether the step succeeded.
659 | 	 */
660 | 	const auto run = [](auto &&action) {
661 | 		try {
662 | 			action();
663 | 			return true;
664 | 		} catch (const std::exception &e) {
665 | 			fprintf(stderr, "%s\n", e.what());
666 | 			return false;
667 | 		}
668 | 	};
669 |
670 | 	if (cmd == "enable-hwts") {
671 | 		if (!run([&] { do_enable_hwts(args); })) {
672 | 			return EXIT_FAILURE;
673 | 		}
674 | 	} else if (cmd == "register") {
675 | 		if (force) {
676 | 			/* Best effort: a failing unregister is reported, but the
677 | 			 * register is attempted nonetheless.
678 | 			 */
679 | 			run([] { do_unregister(); });
680 | 		}
681 |
682 | 		if (!run([&] { do_register(args); })) {
683 | 			return EXIT_FAILURE;
684 | 		}
685 | 	} else if (cmd == "trace") {
686 | 		if (!run([&] { do_trace(output_csv); })) {
687 | 			return EXIT_FAILURE;
688 | 		}
689 | 	} else if (cmd == "unregister") {
690 | 		/* argc - optind counts the command itself, so anything above 1
691 | 		 * is an extra argument.
692 | 		 */
693 | 		if (argc - optind > 1) {
694 | 			fprintf(stderr,
695 | 				"unexpected argument(s) after 'unregister'\n");
696 | 			return EXIT_FAILURE;
697 | 		}
698 | 		if (!run([] { do_unregister(); })) {
699 | 			return EXIT_FAILURE;
700 | 		}
701 | 	} else {
702 | 		usage(argv[0], stderr);
703 | 		return EXIT_FAILURE;
704 | 	}
705 |
706 | 	return EXIT_SUCCESS;
707 | }
695 |
--------------------------------------------------------------------------------
/bpf/powertcp_tcp-int.bpf.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 |
3 | #include "tcp_int_common.h"
4 | #include "tcp_int_common.bpf.h"
5 |
6 | /* Build the current INT data for sk from the TCP-INT socket storage.
7 |  *
8 |  * Returns a pointer to the single-hop cached_int in the socket's private
9 |  * data. Returns NULL, with cached_int.n_hop reset to 0, if no TCP-INT state
10 |  * is stored for the socket or if the (masked) time delta to prev_int is 0.
11 |  * prev_int may be NULL; the smoothed RTT is then used as the time delta.
12 |  */
13 | static const struct powertcp_int *get_int(struct sock *sk,
14 | 					  const struct powertcp_int *prev_int)
15 | {
16 | 	struct ptcp_powertcp *ca = inet_csk_ca(sk);
17 | 	struct powertcp_int_impl *int_impl = &ca->int_impl;
18 | 	const struct tcp_sock *tp = tcp_sk(sk);
19 | 	/* Not using tcp_int_get_state() here since it uses
20 | 	 * BPF_SK_STORAGE_GET_F_CREATE. We might want to use a missing map entry as
21 | 	 * an indicator to fall back to RTT-PowerTCP.
22 | 	 */
23 | 	const struct tcp_int_state *tint =
24 | 		bpf_sk_storage_get(&map_tcp_int_state, sk, NULL, 0);
25 | 	u32 bandwidth;
26 | 	u32 ts;
27 | 	u32 dt;
28 |
29 | 	if (!tint) {
30 | 		int_impl->cached_int.n_hop = 0;
31 | 		return NULL;
32 | 	}
33 |
34 | 	bandwidth = BITS_TO_BYTES(hop_bw);
35 | #if USE_SWLAT_AS_TIMESTAMP
36 | 	ts = tint->swlat;
37 | #else
38 | 	ts = get_tstamp(sk);
39 | #endif
40 | 	/* Without a previous sample, fall back to the smoothed RTT: srtt_us is
41 | 	 * used as (RTT in us) << 3 elsewhere, so * (1000 >> 3) converts it to ns.
42 | 	 */
43 | 	dt = (!prev_int ? tp->srtt_us * (1000u >> 3) :
44 | 			  ts - prev_int->hops[0].ts) &
45 | 	     max_ts;
46 |
47 | 	if (dt == 0) {
48 | 		int_impl->cached_int.n_hop = 0;
49 | 		return NULL;
50 | 	}
51 |
52 | 	int_impl->cached_int.n_hop = 1;
53 | 	/* TCP-INT does not provide an identification for the path. */
54 | 	/* TODO: Evaluate if it makes sense to use the switch ID as path ID.
55 | 	 * Could lead to a too frequently detected path change, though.
56 | 	 */
57 | 	int_impl->cached_int.path_id = 1;
58 |
59 | 	int_impl->cached_int.hops[0].bandwidth = bandwidth;
60 | 	int_impl->cached_int.hops[0].qlen = tint->qdepth;
61 | 	int_impl->cached_int.hops[0].ts = ts;
62 | 	/* In lack of a tx_bytes value, we estimate it here. A factor of
63 | 	 * MEGA/USEC_PER_SEC is cancelled in the calculation.
64 | 	 *
65 | 	 * All factors are multiplied in 64 bit before dividing. Dividing first
66 | 	 * in 32 bit truncated the estimate to 0 whenever
67 | 	 * bandwidth * util / 100 was below NSEC_PER_USEC.
68 | 	 */
69 | 	int_impl->cached_int.hops[0].tx_bytes =
70 | 		(u64)bandwidth * tint->util * dt / 100 / NSEC_PER_USEC;
71 |
72 | 	return &int_impl->cached_int;
73 | }
58 |
59 | /* Return the INT data remembered from the previous ACK, or NULL if none has
60 |  * been stored yet (n_hop == 0). The stored tx_bytes of hop 0 is zeroed as a
61 |  * side effect before the pointer is handed out.
62 |  */
63 | static const struct powertcp_int *get_prev_int(struct sock *sk)
64 | {
65 | 	struct ptcp_powertcp *ca = inet_csk_ca(sk);
66 | 	struct powertcp_int *last = &ca->int_impl.prev_int;
67 |
68 | 	if (!last->n_hop) {
69 | 		return NULL;
70 | 	}
71 |
72 | 	/* With TCP-INT, the difference in tx_bytes since last ACK is already
73 | 	 * estimated in get_int(). The previous value must be 0 so
74 | 	 * ptcp_norm_power() does not calculate a second difference with a
75 | 	 * value potentially coming from a different switch.
76 | 	 */
77 | 	last->hops[0].tx_bytes = 0;
78 | 	return last;
79 | }
77 | /* Initialization hook of the TCP-INT backend, called from ptcp_init().
   |  * Nothing needs to be set up here; always returns 0.
   |  */
78 | static int int_impl_init(struct sock *sk)
79 | {
80 | return 0;
81 | }
82 | /* Release hook of the TCP-INT backend, called from ptcp_release().
   |  * The backend holds no resources of its own, so this does nothing.
   |  */
83 | static void int_impl_release(struct sock *sk)
84 | {
85 | /* no-op */
86 | }
87 | /* Reset hook of the TCP-INT backend: sets the path ID of the remembered INT
   |  * data to 0. get_int() always reports path ID 1, so the next comparison of
   |  * path IDs in ptcp_norm_power() sees a mismatch. The event ev is not used.
   |  */
88 | static void int_impl_reset(powertcp_int_impl_t *int_impl, enum tcp_ca_event ev)
89 | {
90 | int_impl->prev_int.path_id = 0;
91 | }
92 | /* Remember the INT data of the current ACK (cached_int) as the previous
   |  * data (prev_int) for the next ACK. This is a plain struct copy.
   |  */
93 | static void int_impl_update_old(powertcp_int_impl_t *int_impl)
94 | {
95 | int_impl->prev_int = int_impl->cached_int;
96 | }
97 |
--------------------------------------------------------------------------------
/bpf/powertcp_tcp-int_head.bpf.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 |
3 | enum { max_n_hops = 1 };
4 |
5 | #include "powertcp_int.c"
6 |
7 | /* TCP-INT's swlat field (which we optionally replace with a timestamp) is
8 | * only 24 bits long. max_ts is used as a bit mask for timestamp differences.
9 | */
10 | static const unsigned int max_ts = 0xFFFFFFu;
11 |
12 | /* In case the tx_bytes value is taken directly from a less-than-32-bit INT
13 | * field, its maximum value has to be known for correct wrap-around in
14 | * calculations.
15 | */
16 | static const u32 max_tx_bytes = 0xFFFFFFFFu;
17 |
18 | struct powertcp_int_impl {
19 | struct powertcp_int cached_int; /* INT data of the current ACK, filled in by get_int() */
20 | struct powertcp_int prev_int; /* copy of cached_int made by int_impl_update_old() */
21 | };
22 | typedef struct powertcp_int_impl powertcp_int_impl_t;
23 |
--------------------------------------------------------------------------------
/dkms.conf:
--------------------------------------------------------------------------------
1 | PACKAGE_NAME="powertcp"
2 | PACKAGE_VERSION="0.0.1"
3 | BUILT_MODULE_NAME[0]="tcp_powertcp"
4 | DEST_MODULE_LOCATION[0]="/kernel/net/ipv4/"
5 | AUTOINSTALL="yes"
6 |
--------------------------------------------------------------------------------
/doc/code-structure.md:
--------------------------------------------------------------------------------
1 | # Code structure for module and BPF implementation
2 |
3 | This is the common code structure for the module’s `tcp_powertcp.c` and the BPF
4 | implementation’s `powertcp.bpf.c`. It uses direct `#include`s of source files
5 | (instead of using multiple compilation units with headers) to enable full
6 | inlining and other optimizations for both the module and BPF implementation.
7 |
8 | The parts *must* appear in this order:
9 |
10 | 1. General `#include`s, including the required `linux/types.h` (module) or
11 | `vmlinux.h` (BPF)
12 |
13 | 2. An `#include` defining INT-related values and types in this order:
14 | 1. Provide `max_n_hops` as an enumerator
15 |
16 | 2. `#include "powertcp_int.c"` which provides `struct powertcp_int`
17 | (requires `max_n_hops`) and other structs
18 |
19 | 3. Provide a typedef for `powertcp_int_impl_t`, which can alias e.g. a
20 | struct or a pointer, and constants `max_ts` and `max_tx_bytes`
21 |
22 | This include should be named `tcp_powertcp_METHOD_head.c` (module) or
23 | `powertcp_METHOD_head.bpf.c` (BPF), e.g. `tcp_powertcp_foobar_head.c` or
24 | `powertcp_tcp-int_head.bpf.c`.
25 |
26 | `powertcp_no-int_head.c` shows the required content of this file.
27 |
28 | 3. `#define`s for various `POWERTCP_*` macros as needed; their default values
29 | are defined in `powertcp_head.c`
30 |
31 | 4. `#include "powertcp_head.c"` providing the core `struct powertcp` (requires
32 | `powertcp_int_impl_t`), the variables for the algorithm parameters, and
33 | default `#define`s for the still undefined `POWERTCP_*` macros
34 |
35 | 5. Additional (algorithm) parameter variables, other static/constant variables
36 |
37 | 6. Definitions of the required, module- or BPF-specific functions
38 |
39 | 7. An `#include` defining the INT-related functions
40 |
41 | This include should be named `tcp_powertcp_METHOD.c` (module) or
42 | `powertcp_METHOD.bpf.c` (BPF), e.g. `tcp_powertcp_foobar.c` or
43 | `powertcp_tcp-int.bpf.c`.
44 |
45 | `powertcp_no-int.c` shows the required content of this file.
46 |
47 | 8. `#include "powertcp.c"` of the algorithm implementation
48 |
49 | 9. Additional definitions of functions requiring the PowerTCP
50 | `tcp_congestion_ops` instances, e.g. `module_init` and `module_exit`
51 |
--------------------------------------------------------------------------------
/doc/module.md:
--------------------------------------------------------------------------------
1 | # PowerTCP kernel module
2 |
3 | > [!IMPORTANT]
4 | > The kernel module is missing a source of telemetry (the integration is
5 | > prepared). Therefore, the `powertcp` congestion control in the module is only a
6 | > proof of concept.
7 |
8 | > [!NOTE]
9 | > The `rttpowertcp` in the kernel module is functional but—due to limitations
10 | > in the kernel—lacks access to higher-precision hardware timestamps.
11 |
12 | Following are step-by-step instructions on how to use and experiment with the
13 | PowerTCP kernel module. All commands listed here are assumed to be executed in
14 | the root folder of this repository.
15 |
16 | When loaded into the kernel, the congestion control algorithms are called
17 | `powertcp` and `rttpowertcp`.
18 |
19 | ## Prerequisites
20 | - Any recent Linux kernel and corresponding kernel headers
21 | - `gcc`
22 | - `make`
23 | - `dkms` (optional)
24 |
25 | ## Preparation
26 |
27 | The preparation steps need to be executed on both client and server.
28 |
29 | 1. Install required packages (as root/with `sudo`):
30 | ```
31 | apt install gcc linux-headers-$(uname -r) make
32 | ```
33 |
34 | Ideally, tune the network interface *IFACE* for low latency etc. (as root/with
35 | `sudo`):
36 | ```
37 | apt install ethtool procps tuned
38 | ./tools/tune-eth IFACE
39 | ```
40 | 2. Build the PowerTCP module implementation:
41 | ```
42 | make
43 | ```
44 |
45 | ## On the server
46 |
47 | *Close any previously opened screen sessions that were opened this way.*
48 |
49 | Start `iperf` and `iperf3` server instances, ready to use PowerTCP, in a screen
50 | session (as root/with `sudo`):
51 | ```
52 | ./tools/setup-module iperf-servers
53 | ```
54 | **Beware: You are root user inside the screen session!**
55 |
56 | Algorithm parameters (see [On the client](#on-the-client)) do not need to be
57 | set on the server, they are irrelevant here.
58 |
59 | ## On the client
60 |
61 | The `setup-module` script opens a screen session readily prepared to use
62 | PowerTCP.
63 |
64 | You can and should pass algorithm parameters to `setup-module`, e.g. (as
65 | root/with `sudo`):
66 | ```
67 | ./tools/setup-module iperf-client host_bw=25000 hop_bw=25000 base_rtt=50
68 | ```
69 | For a list of the available parameters see
70 | ```
71 | /sbin/modinfo tcp_powertcp.ko
72 | ```
73 | Note that a value for the `gamma` parameter must be multiplied with the value
74 | of `power_scale` defined in [powertcp_defs.h](../powertcp_defs.h) and rounded
75 | to an integer afterwards.
76 |
77 | **Beware: You are root user inside the screen session!**
78 |
79 | Inside the screen session, you can, e.g.,
80 | - run `iperf3` (or `iperf`, the options differ)
81 | ```
82 | iperf3 -N -C rttpowertcp -c SERVER_IP
83 | ```
84 |
85 | ## Installation through DKMS
86 |
87 | The kernel module is [prepared](../dkms.conf) for system-wide installation through
88 | DKMS. The [Makefile](../Makefile) provides a convenience target for installation
89 | through DKMS (as root/with `sudo`):
90 | ```
91 | make dkms_install
92 | ```
93 |
94 | ## Tracepoints
95 | There are
96 | [tracepoints](https://www.kernel.org/doc/html/latest/trace/tracepoints.html) to
97 | follow the algorithm, mainly for the three core functions defined in the
98 | [paper](#for-powertcp) and the values used and returned by them. The tracepoints
99 | can be found in `/sys/kernel/debug/tracing/events/powertcp`.
100 |
101 | They can be enabled for example (see
102 | [Event Tracing](https://www.kernel.org/doc/html/latest/trace/events.html)) with
103 | (as root/with `sudo`)
104 | ```
105 | echo 1 > /sys/kernel/debug/tracing/events/powertcp/enable
106 | ```
107 | and shown with (as root/with `sudo`)
108 | ```
109 | cat /sys/kernel/debug/tracing/trace_pipe
110 | ```
111 | or used with any other of the available tools, like
112 | [bpftrace](https://github.com/iovisor/bpftrace).
113 |
114 | ## Development Resources
115 | - [Kernel Build System: Building External Modules](https://www.kernel.org/doc/html/latest/kbuild/modules.html)
116 |
--------------------------------------------------------------------------------
/powertcp.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 | /*
3 | * PowerTCP congestion control
4 | *
5 | * Based on the algorithm developed in:
6 | * Addanki, V., O. Michel, and S. Schmid.
7 | * "PowerTCP: Pushing the Performance Limits of Datacenter Networks."
8 | * 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22).
9 | * USENIX Association, 2022.
10 | * Available at: https://arxiv.org/pdf/2112.14309.pdf
11 | *
12 | * Implemented by:
13 | * Jörn-Thorben Hinz, TU Berlin, 2022.
14 | */
15 | /* POWERTCP_CONG_OPS_NAME(name) builds the string used for
   |  * tcp_congestion_ops.name: POWERTCP_CONG_OPS_NAME_PREFIX is pasted in front
   |  * of the given name and the result is turned into a string literal. The
   |  * two-level CONCAT indirection lets the prefix macro be expanded before the
   |  * tokens are pasted together.
   |  */
16 | #define POWERTCP_CONG_OPS_NAME_CONCAT2(prefix, cong_ops_name) \
17 | prefix##cong_ops_name
18 | #define POWERTCP_CONG_OPS_NAME_CONCAT(prefix, cong_ops_name) \
19 | POWERTCP_CONG_OPS_NAME_CONCAT2(prefix, cong_ops_name)
20 | #define POWERTCP_CONG_OPS_NAME(cong_ops_name) \
21 | __stringify(POWERTCP_CONG_OPS_NAME_CONCAT( \
22 | POWERTCP_CONG_OPS_NAME_PREFIX, cong_ops_name))
23 |
24 | /* Forget the remembered congestion window and the snd_nxt recorded with it.
25 |  * Both being 0 marks the entry as unset for get_cwnd() and update_old().
26 |  */
27 | static void clear_old_cwnds(struct sock *sk)
28 | {
29 | 	struct powertcp *ca = inet_csk_ca(sk);
30 |
31 | 	ca->old_cwnd.snd_nxt = 0;
32 | 	ca->old_cwnd.cwnd = 0;
33 | }
30 |
31 | /* Exponentially weighted moving average in integer arithmetic:
32 |  * (weight * value + (weight_scale - weight) * old_value) / weight_scale.
33 |  * weight is expected to be in the range 0..weight_scale.
34 |  */
35 | static unsigned long ewma(unsigned long weight, unsigned long weight_scale,
36 | 			  unsigned long value, unsigned long old_value)
37 | {
38 | 	const unsigned long new_part = weight * value;
39 | 	const unsigned long old_part = (weight_scale - weight) * old_value;
40 |
41 | 	return (new_part + old_part) / weight_scale;
42 | }
37 |
38 | /* Return the (scaled) snd_cwnd that was in effect when the newly acknowledged
39 |  * segment(s) were sent. Falls back to the current ca->snd_cwnd while no old
40 |  * window has been recorded.
41 |  *
42 |  * An additional check whether the recorded snd_nxt lies before the
43 |  * acknowledged sequence number (tp->snd_una) is currently disabled.
44 |  */
45 | static unsigned long get_cwnd(const struct sock *sk)
46 | {
47 | 	const struct powertcp *ca = inet_csk_ca(sk);
48 |
49 | 	/* Nothing recorded yet: either field still at its cleared value. */
50 | 	if (ca->old_cwnd.cwnd == 0 || ca->old_cwnd.snd_nxt == 0) {
51 | 		return ca->snd_cwnd;
52 | 	}
53 |
54 | 	return ca->old_cwnd.cwnd;
55 | }
54 |
55 | /* Return the most recently measured RTT (in us). If the rate sample carries
56 |  * no RTT (rs->rtt_us is negative), the smoothed RTT of the socket is used.
57 |  */
58 | static unsigned long get_rtt(const struct sock *sk,
59 | 			     const struct rate_sample *rs)
60 | {
61 | 	const long sample = rs->rtt_us; /* This is -1 if unavailable. */
62 |
63 | 	if (sample >= 0) {
64 | 		return sample;
65 | 	}
66 |
67 | 	return tcp_sk(sk)->srtt_us >> 3;
68 | }
66 |
67 | /* Map 0 to 1 and return every other value unchanged, e.g. to make a value
68 |  * safe to use as a divisor.
69 |  */
70 | static unsigned long not_zero(unsigned long val)
71 | {
72 | 	return val ? val : 1UL;
73 | }
72 |
73 | /* Store the scaled window cwnd in the private data and apply it to the
74 |  * socket: tp->snd_cwnd becomes cwnd / cwnd_scale, limited to
75 |  * tp->snd_cwnd_clamp and never less than 1. If tracing is enabled and
76 |  * trace_event is not NULL, the applied window is recorded in it.
77 |  */
78 | static void set_cwnd(struct sock *sk, unsigned long cwnd,
79 | 		     struct powertcp_trace_event *trace_event)
80 | {
81 | 	struct powertcp *ca = inet_csk_ca(sk);
82 | 	struct tcp_sock *tp = tcp_sk(sk);
83 | 	unsigned long segments = cwnd / cwnd_scale;
84 |
85 | 	ca->snd_cwnd = cwnd;
86 |
87 | 	if (segments > tp->snd_cwnd_clamp) {
88 | 		segments = tp->snd_cwnd_clamp;
89 | 	}
90 | 	tp->snd_cwnd = not_zero(segments);
91 |
92 | 	if (tracing_enabled() && trace_event) {
93 | 		trace_event->cwnd = tp->snd_cwnd;
94 | 	}
95 | }
88 |
89 | /* Determine the base (~= minimum) RTT (in us) and store it in ca->base_rtt.
90 |  *
91 |  * Sources in order of preference: the base_rtt parameter if it is set
92 |  * (> -1), the minimum RTT tracked by TCP, the smoothed RTT, and finally a
93 |  * fixed fallback of one second.
94 |  */
95 | static void update_base_rtt(struct sock *sk)
96 | {
97 | 	struct powertcp *ca = inet_csk_ca(sk);
98 | 	const struct tcp_sock *tp = tcp_sk(sk);
99 | 	u32 rtt;
100 |
101 | 	if (base_rtt > -1) {
102 | 		ca->base_rtt = base_rtt;
103 | 		return;
104 | 	}
105 |
106 | 	rtt = tcp_min_rtt(tp);
107 | 	if (rtt == ~0U) {
108 | 		/* No minimum RTT available, try the smoothed RTT next. */
109 | 		rtt = tp->srtt_us >> 3;
110 | 		if (!rtt) {
111 | 			/* bbr_init_pacing_rate_from_rtt() also uses this as
112 | 			 * fallback.
113 | 			 */
114 | 			rtt = USEC_PER_SEC;
115 | 		}
116 | 	}
117 |
118 | 	ca->base_rtt = rtt;
119 | }
116 |
117 | /* Recalculate ca->beta after the base RTT was updated.
118 |  *
119 |  * Only done when beta is not given as a parameter (beta < 0) and the base
120 |  * RTT either decreased compared to old_base_rtt or was not set before
121 |  * (old_base_rtt == ULONG_MAX). ca->beta is only ever lowered here.
122 |  */
123 | static void update_beta(struct sock *sk, unsigned long old_base_rtt)
124 | {
125 | 	struct powertcp *ca = inet_csk_ca(sk);
126 | 	const struct tcp_sock *tp = tcp_sk(sk);
127 | 	unsigned long candidate;
128 |
129 | 	if (beta >= 0) {
130 | 		return;
131 | 	}
132 | 	if (ca->base_rtt >= old_base_rtt && old_base_rtt != ULONG_MAX) {
133 | 		return;
134 | 	}
135 |
136 | 	/* The factors MEGA and 1 / USEC_PER_SEC cancel each other out. */
137 | 	candidate = BITS_TO_BYTES(cwnd_scale /* * MEGA */ * ca->host_bw *
138 | 				  ca->base_rtt / expected_flows) /
139 | 		    tp->mss_cache /* / USEC_PER_SEC */;
140 | 	ca->beta = min(ca->beta, candidate);
141 | }
131 |
132 | /* Common reset of the algorithm state for the event ev.
133 |  *
134 |  * CA_EVENT_TX_START and CA_EVENT_CWND_RESTART refresh the base RTT and beta.
135 |  * CA_EVENT_CWND_RESTART additionally sets pacing rate and window to the
136 |  * values derived from host_bw and the base RTT, disables slow start via
137 |  * snd_ssthresh and clears the smoothed power and the remembered old window.
138 |  */
139 | static void reset(struct sock *sk, enum tcp_ca_event ev)
140 | {
141 | 	struct powertcp *ca = inet_csk_ca(sk);
142 | 	struct tcp_sock *tp = tcp_sk(sk);
143 | 	const bool restart = ev == CA_EVENT_CWND_RESTART;
144 | 	unsigned long rate;
145 | 	unsigned long cwnd;
146 |
147 | 	if (restart || ev == CA_EVENT_TX_START) {
148 | 		const unsigned long prev_base_rtt = ca->base_rtt;
149 |
150 | 		update_base_rtt(sk);
151 | 		update_beta(sk, prev_base_rtt);
152 | 	}
153 |
154 | 	/* Only reset the remaining values on a CA_EVENT_CWND_RESTART (used on
155 | 	 * initialization). Otherwise we would reset cwnd and rate too frequently
156 | 	 * if there are frequent CA_EVENT_TX_STARTs.
157 | 	 */
158 | 	if (!restart) {
159 | 		return;
160 | 	}
161 |
162 | 	rate = BITS_TO_BYTES(MEGA * ca->host_bw);
163 | 	cwnd = cwnd_scale * rate * ca->base_rtt / tp->mss_cache / USEC_PER_SEC;
164 | 	set_rate(sk, rate);
165 | 	set_cwnd(sk, cwnd, NULL);
166 | 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
167 |
168 | 	ca->p_smooth = 0;
169 |
170 | 	clear_old_cwnds(sk);
171 | }
160 |
161 | /* Remember state for the next ACK: the smoothed power and, if the recorded
162 |  * old window is unset or its snd_nxt has been passed by snd_una, the current
163 |  * window together with the current snd_nxt. Always returns true.
164 |  */
165 | static bool update_old(struct sock *sk, unsigned long p_smooth)
166 | {
167 | 	struct powertcp *ca = inet_csk_ca(sk);
168 | 	const struct tcp_sock *tp = tcp_sk(sk);
169 | 	const bool unset = ca->old_cwnd.cwnd == 0 &&
170 | 			   ca->old_cwnd.snd_nxt == 0;
171 |
172 | 	if (unset || before(ca->old_cwnd.snd_nxt, tp->snd_una)) {
173 | 		ca->old_cwnd.snd_nxt = tp->snd_nxt;
174 | 		ca->old_cwnd.cwnd = ca->snd_cwnd;
175 | 	}
176 |
177 | 	ca->p_smooth = p_smooth;
178 |
179 | 	return true;
180 | }
177 |
178 | /* Calculate, apply and return the new scaled congestion window.
179 |  *
180 |  * The target window power_scale * cwnd_old / norm_power + beta is smoothed
181 |  * against the current window with the weight gamma / gamma_scale. The result
182 |  * is at least 1 and at most base_bdp, and is applied with set_cwnd().
183 |  * A norm_power of 0 is treated as 1.
184 |  */
185 | static unsigned long update_window(struct sock *sk, unsigned long cwnd_old,
186 | 				   unsigned long norm_power,
187 | 				   struct powertcp_trace_event *trace_event)
188 | {
189 | 	const struct powertcp *ca = inet_csk_ca(sk);
190 | 	const struct tcp_sock *tp = tcp_sk(sk);
191 | 	const unsigned long base_bdp = BITS_TO_BYTES(cwnd_scale) * ca->host_bw *
192 | 				       ca->base_rtt / tp->mss_cache;
193 | 	unsigned long target;
194 | 	unsigned long cwnd;
195 |
196 | 	target = power_scale * cwnd_old / not_zero(norm_power) + ca->beta;
197 | 	cwnd = not_zero(ewma(gamma, gamma_scale, target, ca->snd_cwnd));
198 | 	if (cwnd > base_bdp) {
199 | 		cwnd = base_bdp;
200 | 	}
201 |
202 | 	set_cwnd(sk, cwnd, trace_event);
203 | 	return cwnd;
204 | }
197 | /* Variant-specific initialization of the INT-based PowerTCP: delegates to
    |  * the INT backend and returns its result.
    |  */
198 | static int ptcp_init(struct sock *sk)
199 | {
200 | return int_impl_init(sk);
201 | }
202 | /* Calculate the smoothed normalized power from in-band network telemetry.
    |  *
    |  * Fetches the previous and the current INT data with get_prev_int() and
    |  * get_int(). Returns 0 if either is missing or if their path IDs differ.
    |  *
    |  * For each hop (at most max_n_hops), the function derives the time delta
    |  * dt, the change in queue length and the transmitted bytes, and from those
    |  * lambda ("current"), voltage (queue length + bandwidth * base RTT) and the
    |  * hop's power, normalized by the equilibrium value. The largest normalized
    |  * power over all hops is used, together with the dt of that hop.
    |  *
    |  * Afterwards delta_t is limited to the base RTT (in ns), p_norm is raised
    |  * to at least p_norm_cutoff and smoothed with ewma(), unless ca->p_smooth
    |  * is still 0, in which case p_norm is taken as is.
    |  *
    |  * If tracing is enabled and trace_event is not NULL, qlen, tx_bytes_diff,
    |  * delta_t, p_norm and p_smooth are recorded in it. rs is not used.
    |  *
    |  * Returns the smoothed normalized power, scaled by power_scale.
    |  */
203 | static unsigned long ptcp_norm_power(struct sock *sk,
204 | const struct rate_sample *rs,
205 | struct powertcp_trace_event *trace_event)
206 | {
207 | const struct powertcp *ca = inet_csk_ca(sk);
208 | unsigned long delta_t = 0;
209 | unsigned long p_norm = 0;
210 | unsigned long p_smooth = ca->p_smooth;
211 |
212 | const struct powertcp_int *prev_int = get_prev_int(sk);
213 | const struct powertcp_int *this_int = get_int(sk, prev_int);
214 | int i;
215 |
216 | /* TODO: Do something helpful (a full reset?) when the path changes. */
217 | if (!this_int || !prev_int || this_int->path_id != prev_int->path_id) {
218 | /* Power calculations will be skipped for the first one or two ACKs.
219 | * p_smooth will still be 0 then. This is intentional to have power
220 | * smoothing start with a proper value (=p_norm) at the end of this
221 | * function.
222 | */
223 | return 0;
224 | }
225 |
226 | /* for each egress port i on the path */
227 | for (i = 0; i < this_int->n_hop && i < max_n_hops; ++i) {
228 | const struct powertcp_hop_int *hop_int = &this_int->hops[i];
229 | const struct powertcp_hop_int *prev_hop_int =
230 | &prev_int->hops[i];
231 | unsigned long dt =
232 | not_zero((hop_int->ts - prev_hop_int->ts) & max_ts);
233 | long queue_diff =
234 | (long)hop_int->qlen - (long)prev_hop_int->qlen;
235 | u32 tx_bytes_diff =
236 | (hop_int->tx_bytes - prev_hop_int->tx_bytes) &
237 | max_tx_bytes;
238 | /* The variable name "current" instead of lambda would conflict with a
239 | * macro of the same name in asm-generic/current.h.
240 | */
241 | unsigned long lambda =
242 | not_zero((unsigned long)max(
243 | 0l, queue_diff + (long)tx_bytes_diff) *
244 | (NSEC_PER_SEC / dt));
245 | unsigned long bdp = hop_int->bandwidth * ca->base_rtt;
246 | unsigned long voltage = hop_int->qlen + bdp;
247 | unsigned long hop_p = lambda * voltage;
248 | unsigned long equilibrium = not_zero(
249 | (unsigned long)hop_int->bandwidth * hop_int->bandwidth /
250 | power_scale * MEGA * ca->base_rtt);
251 | unsigned long hop_p_norm = hop_p / equilibrium;
252 | if (hop_p_norm > p_norm || i == 0) {
253 | p_norm = hop_p_norm;
254 | delta_t = dt;
255 |
256 | if (tracing_enabled() && trace_event) {
257 | trace_event->qlen = hop_int->qlen;
258 | trace_event->tx_bytes_diff = tx_bytes_diff;
259 | }
260 | }
261 | }
262 |
263 | delta_t = min(delta_t, NSEC_PER_USEC * ca->base_rtt);
264 | p_norm = max(p_norm_cutoff, p_norm);
265 | p_smooth = p_smooth == 0 ? p_norm :
266 | ewma(delta_t, NSEC_PER_USEC * ca->base_rtt,
267 | p_norm, p_smooth);
268 |
269 | if (tracing_enabled() && trace_event) {
270 | trace_event->delta_t = delta_t;
271 | trace_event->p_norm = p_norm;
272 | trace_event->p_smooth = p_smooth;
273 | }
274 |
275 | return p_smooth;
276 | }
277 | /* Variant-specific cleanup of the INT-based PowerTCP: delegates to the INT
    |  * backend.
    |  */
278 | static void ptcp_release(struct sock *sk)
279 | {
280 | int_impl_release(sk);
281 | }
282 | /* Reset the INT-based PowerTCP for the event ev: first the state of the INT
    |  * backend, then the common algorithm state via reset().
    |  */
283 | static void ptcp_reset(struct sock *sk, enum tcp_ca_event ev)
284 | {
285 | struct ptcp_powertcp *ca = inet_csk_ca(sk);
286 | int_impl_reset(&ca->int_impl, ev);
287 | reset(sk, ev);
288 | }
289 | /* Remember the state of this ACK for the next one: lets the INT backend
    |  * store its current data, then calls the common update_old() and returns
    |  * its result. rs is not used.
    |  */
290 | static bool ptcp_update_old(struct sock *sk, const struct rate_sample *rs,
291 | unsigned long p_smooth)
292 | {
293 | struct ptcp_powertcp *ca = inet_csk_ca(sk);
294 | int_impl_update_old(&ca->int_impl);
295 | return update_old(sk, p_smooth);
296 | }
297 | /* Window update of the INT-based PowerTCP: forwards all arguments unchanged
    |  * to the common update_window() and returns the new scaled window.
    |  */
298 | static unsigned long
299 | ptcp_update_window(struct sock *sk, unsigned long cwnd_old,
300 | unsigned long norm_power,
301 | struct powertcp_trace_event *trace_event)
302 | {
303 | return update_window(sk, cwnd_old, norm_power, trace_event);
304 | }
305 | /* Variant-specific initialization of the RTT-based PowerTCP: nothing to set
    |  * up here, always returns 0.
    |  */
306 | static int rttptcp_init(struct sock *sk)
307 | {
308 | return 0;
309 | }
310 | /* Calculate the smoothed normalized power from RTT measurements.
    |  *
    |  * Returns the stored ca->p_smooth unchanged while snd_una is still before
    |  * ca->last_updated. Otherwise the current timestamp is stored in ca->t and
    |  * the RTT gradient is calculated from the change of the RTT since the last
    |  * update over the elapsed time dt, scaled by power_scale. The normalized
    |  * power is (gradient + 1) * RTT / base RTT in that fixed-point scale; for a
    |  * decreasing RTT the factor (1 - gradient) is limited to not go below 0.
    |  *
    |  * p_norm is then raised to at least p_norm_cutoff and smoothed with ewma()
    |  * using delta_t (dt limited to the base RTT in ns) as weight.
    |  *
    |  * If tracing is enabled and trace_event is not NULL, delta_t, p_norm,
    |  * p_smooth and rtt_grad are recorded in it.
    |  * NOTE(review): for a decreasing RTT, the recorded rtt_grad is the
    |  * magnitude of the gradient; its sign is not recorded.
    |  *
    |  * Returns the smoothed normalized power, scaled by power_scale.
    |  */
311 | static unsigned long
312 | rttptcp_norm_power(struct sock *sk, const struct rate_sample *rs,
313 | struct powertcp_trace_event *trace_event)
314 | {
315 | struct rttptcp_powertcp *ca = inet_csk_ca(sk);
316 | const struct tcp_sock *tp = tcp_sk(sk);
317 | unsigned long dt, rtt_grad, p_norm, delta_t;
318 | unsigned long p_smooth = ca->p_smooth;
319 | unsigned long rtt_us;
320 |
321 | if (before(tp->snd_una, ca->last_updated)) {
322 | return p_smooth;
323 | }
324 |
325 | ca->t = get_tstamp(sk);
326 | rtt_us = get_rtt(sk, rs);
327 | /* Timestamps are always increasing here, logically. So we want to have
328 | * unsigned wrap-around when it's time and don't use tcp_stamp_us_delta().
329 | */
330 | dt = not_zero(ca->t - ca->t_prev);
331 | delta_t = min(dt, ca->base_rtt * NSEC_PER_USEC);
332 | if (ca->prev_rtt_us <= rtt_us) {
333 | rtt_grad = NSEC_PER_USEC * power_scale *
334 | (rtt_us - ca->prev_rtt_us) / dt;
335 | p_norm = (rtt_grad + power_scale) * rtt_us / ca->base_rtt;
336 | } else {
337 | /* Separate code path for negative rtt_grad since BPF does not support
338 | * division by signed numbers.
339 | */
340 | rtt_grad = NSEC_PER_USEC * power_scale *
341 | (ca->prev_rtt_us - rtt_us) / dt;
342 | p_norm = (power_scale - min(power_scale, rtt_grad)) * rtt_us /
343 | ca->base_rtt;
344 | }
345 | p_norm = max(p_norm_cutoff, p_norm);
346 |
347 | /* powertcp.p_smooth is initialized with 0, we don't want to smooth for the
348 | * very first calculation.
349 | */
350 | p_smooth = p_smooth == 0 ? p_norm :
351 | ewma(delta_t, NSEC_PER_USEC * ca->base_rtt,
352 | p_norm, p_smooth);
353 |
354 | if (tracing_enabled() && trace_event) {
355 | trace_event->delta_t = delta_t;
356 | trace_event->p_norm = p_norm;
357 | trace_event->p_smooth = p_smooth;
358 | trace_event->rtt_grad = rtt_grad;
359 | }
360 |
361 | return p_smooth;
362 | }
363 | /* Variant-specific cleanup of the RTT-based PowerTCP: there is nothing to
    |  * release.
    |  */
364 | static void rttptcp_release(struct sock *sk)
365 | {
366 | /* no-op */
367 | }
368 |
369 | /* Reset the RTT-based PowerTCP for the event ev.
370 |  *
371 |  * Performs the common reset() first. On CA_EVENT_CWND_RESTART, the point of
372 |  * the last update and the previous RTT are re-initialized from the socket.
373 |  * The previous timestamp is set to the current one on every call.
374 |  */
375 | static void rttptcp_reset(struct sock *sk, enum tcp_ca_event ev)
376 | {
377 | 	struct rttptcp_powertcp *ca = inet_csk_ca(sk);
378 | 	const struct tcp_sock *tp = tcp_sk(sk);
379 |
380 | 	reset(sk, ev);
381 |
382 | 	/* Only reset those on initialization. */
383 | 	// TODO: Evaluate if it actually improves performance of the algorithm
384 | 	// to reset those two values only on CA_EVENT_CWND_RESTART:
385 | 	if (ev == CA_EVENT_CWND_RESTART) {
386 | 		ca->prev_rtt_us = tp->srtt_us >> 3;
387 | 		ca->last_updated = tp->snd_nxt;
388 | 	}
389 |
390 | 	ca->t_prev = ca->t;
391 | }
386 |
387 | /* Remember the state of this ACK for the next update of the RTT-based
388 |  * PowerTCP.
389 |  *
390 |  * Returns false without changing anything while snd_una is still before
391 |  * ca->last_updated. Otherwise runs the common update_old(), records snd_nxt,
392 |  * the current RTT and the current timestamp, and returns true.
393 |  */
394 | static bool rttptcp_update_old(struct sock *sk, const struct rate_sample *rs,
395 | 			       unsigned long p_smooth)
396 | {
397 | 	struct rttptcp_powertcp *ca = inet_csk_ca(sk);
398 | 	const struct tcp_sock *tp = tcp_sk(sk);
399 |
400 | 	if (!before(tp->snd_una, ca->last_updated)) {
401 | 		update_old(sk, p_smooth);
402 |
403 | 		ca->last_updated = tp->snd_nxt;
404 | 		ca->prev_rtt_us = get_rtt(sk, rs);
405 | 		// TODO: There are multiple timestamps available here. Is there a better one?
406 | 		ca->t_prev = ca->t;
407 |
408 | 		return true;
409 | 	}
410 |
411 | 	return false;
412 | }
406 |
407 | /* Window update of the RTT-based PowerTCP.
408 |  *
409 |  * While snd_una is still before ca->last_updated, the window is left alone
410 |  * and the current scaled window is returned. Otherwise the common
411 |  * update_window() calculates, applies and returns the new window.
412 |  */
413 | static unsigned long
414 | rttptcp_update_window(struct sock *sk, unsigned long cwnd_old,
415 | 		      unsigned long norm_power,
416 | 		      struct powertcp_trace_event *trace_event)
417 | {
418 | 	struct rttptcp_powertcp *ca = inet_csk_ca(sk);
419 | 	const bool too_early = before(tcp_sk(sk)->snd_una, ca->last_updated);
420 |
421 | 	if (!too_early) {
422 | 		return update_window(sk, cwnd_old, norm_power, trace_event);
423 | 	}
424 |
425 | 	return ca->snd_cwnd;
426 | }
421 | /* DEFINE_POWERTCP_VARIANT(func_prefix, cong_ops_name) defines one congestion
    |  * control variant: the callbacks powertcp_<func_prefix>_cwnd_event, _init,
    |  * _cong_control and _release, plus a struct tcp_congestion_ops instance
    |  * named cong_ops_name that refers to them and to the shared
    |  * powertcp_cong_avoid, powertcp_ssthresh and powertcp_undo_cwnd.
    |  *
    |  * The callbacks use the variant's functions <func_prefix>_init, _reset,
    |  * _norm_power, _update_window, _update_old and _release and its private
    |  * data struct <func_prefix>_powertcp, which has to fit into
    |  * ICSK_CA_PRIV_SIZE.
    |  *
    |  * - cwnd_event: runs the variant's reset on CA_EVENT_TX_START.
    |  * - init: initializes base_rtt, beta and host_bw, runs the variant's reset
    |  *   with CA_EVENT_CWND_RESTART and requests hardware timestamps and pacing.
    |  * - cong_control: calculates the normalized power; if it is not 0, updates
    |  *   the window and derives the pacing rate from it. Then remembers the
    |  *   state for the next ACK and, if tracing is enabled, the state was
    |  *   updated and the power is not 0, outputs a trace event.
    |  * - release: clears the remembered old window and runs the variant's
    |  *   release.
    |  *
    |  * All callbacks but init return early while ca->host_bw is 0.
    |  */
422 | #define DEFINE_POWERTCP_VARIANT(func_prefix, cong_ops_name) \
423 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_cwnd_event, \
424 | struct sock *sk, enum tcp_ca_event ev) \
425 | { \
426 | struct powertcp *ca = inet_csk_ca(sk); \
427 | \
428 | if (POWERTCP_UNLIKELY(ca->host_bw == 0)) { \
429 | return; \
430 | } \
431 | \
432 | if (ev == CA_EVENT_TX_START) { \
433 | func_prefix##_reset(sk, ev); \
434 | } \
435 | } \
436 | \
437 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_init, \
438 | struct sock *sk) \
439 | { \
440 | struct powertcp *ca = inet_csk_ca(sk); \
441 | \
442 | BUILD_BUG_ON(sizeof(struct powertcp) > ICSK_CA_PRIV_SIZE); \
443 | BUILD_BUG_ON(sizeof(struct func_prefix##_powertcp) > \
444 | ICSK_CA_PRIV_SIZE); \
445 | \
446 | func_prefix##_init(sk); \
447 | \
448 | ca->base_rtt = ULONG_MAX; \
449 | ca->beta = beta < 0 ? ULONG_MAX : beta * cwnd_scale; \
450 | ca->host_bw = get_host_bw(sk); \
451 | \
452 | func_prefix##_reset(sk, CA_EVENT_CWND_RESTART); \
453 | \
454 | require_hwtstamps(sk); \
455 | require_pacing(sk); \
456 | } \
457 | \
458 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_cong_control, \
459 | struct sock *sk, \
460 | const struct rate_sample *rs) \
461 | { \
462 | struct powertcp *ca = inet_csk_ca(sk); \
463 | const struct tcp_sock *tp = tcp_sk(sk); \
464 | unsigned long cwnd_old; \
465 | unsigned long norm_power; \
466 | unsigned long cwnd; \
467 | unsigned long rate; \
468 | bool updated; \
469 | struct powertcp_trace_event trace_event = {}; \
470 | \
471 | if (POWERTCP_UNLIKELY(ca->host_bw == 0)) { \
472 | return; \
473 | } \
474 | \
475 | cwnd_old = get_cwnd(sk); \
476 | norm_power = func_prefix##_norm_power(sk, rs, &trace_event); \
477 | if (norm_power) { \
478 | cwnd = func_prefix##_update_window( \
479 | sk, cwnd_old, norm_power, &trace_event); \
480 | rate = (USEC_PER_SEC * cwnd * tp->mss_cache) / \
481 | ca->base_rtt / cwnd_scale; \
482 | set_rate(sk, rate); \
483 | } \
484 | \
485 | updated = func_prefix##_update_old(sk, rs, norm_power); \
486 | \
487 | if (tracing_enabled() && updated && norm_power) { \
488 | trace_event.rate = rate; \
489 | trace_event.sock_hash = sk->__sk_common.skc_hash; \
490 | output_trace_event(&trace_event); \
491 | } \
492 | } \
493 | \
494 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_release, \
495 | struct sock *sk) \
496 | { \
497 | const struct powertcp *ca = inet_csk_ca(sk); \
498 | \
499 | if (POWERTCP_UNLIKELY(ca->host_bw == 0)) { \
500 | return; \
501 | } \
502 | \
503 | clear_old_cwnds(sk); \
504 | \
505 | func_prefix##_release(sk); \
506 | } \
507 | \
508 | POWERTCP_CONG_OPS_ATTRS struct tcp_congestion_ops cong_ops_name = { \
509 | .cong_avoid = POWERTCP_CONG_OPS_FUNC_PTR powertcp_cong_avoid, \
510 | .cong_control = POWERTCP_CONG_OPS_FUNC_PTR \
511 | powertcp_##func_prefix##_cong_control, \
512 | .cwnd_event = POWERTCP_CONG_OPS_FUNC_PTR \
513 | powertcp_##func_prefix##_cwnd_event, \
514 | .init = POWERTCP_CONG_OPS_FUNC_PTR \
515 | powertcp_##func_prefix##_init, \
516 | .name = POWERTCP_CONG_OPS_NAME(cong_ops_name), \
517 | .release = POWERTCP_CONG_OPS_FUNC_PTR \
518 | powertcp_##func_prefix##_release, \
519 | .ssthresh = POWERTCP_CONG_OPS_FUNC_PTR powertcp_ssthresh, \
520 | .undo_cwnd = POWERTCP_CONG_OPS_FUNC_PTR powertcp_undo_cwnd, \
521 | }
522 |
/* ssthresh hook of the congestion ops: always reports an unlimited slow-start
 * threshold.
 */
u32 POWERTCP_CONG_OPS_FUNC(powertcp_ssthresh, struct sock *sk)
{
	/* We don't do slow starts here! */
	return TCP_INFINITE_SSTHRESH;
}
528 |
/* undo_cwnd hook of the congestion ops: hands back the socket's current
 * snd_cwnd unchanged, i.e. nothing is restored.
 */
u32 POWERTCP_CONG_OPS_FUNC(powertcp_undo_cwnd, struct sock *sk)
{
	/* Never undo after a loss. */
	return tcp_sk(sk)->snd_cwnd;
}
534 |
535 | DEFINE_POWERTCP_VARIANT(ptcp, powertcp);
536 |
537 | /* Cannot name it rtt_powertcp due to the size limit for
538 | * tcp_congestion_ops.name. */
539 | DEFINE_POWERTCP_VARIANT(rttptcp, rttpowertcp);
540 |
--------------------------------------------------------------------------------
/powertcp_defs.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */
2 | /*
3 | * Constants and default values common to both PowerTCP implementations.
4 | */
5 | #ifndef POWERTCP_DEFS_H
6 | #define POWERTCP_DEFS_H
7 |
/* Fixed-point scaling factor for congestion-window-related values; e.g. beta
 * is stored as a number of packets multiplied by cwnd_scale.
 */
static const unsigned long cwnd_scale = (1UL << 10);
/* Host bandwidth that is used when no other value is available. */
static const unsigned long fallback_host_bw = 1000; /* Mbit/s */
/* Fixed-point scaling factor for gamma (see default_gamma below). */
static const unsigned long gamma_scale = (1UL << 10);
/* Fixed-point scaling factor for power values (see p_norm_cutoff). */
static const unsigned long power_scale = (1UL << 16);
/* 1 % of power_scale; the fractional part of the product is dropped by the
 * conversion to unsigned long.
 */
static const unsigned long p_norm_cutoff = 0.01 * power_scale;

/* Avoid an "initializer element is not constant" error with gcc before 8.1 by
 * using an enum instead of static const variables. No, I don't want to use
 * macros for constants here :-)
 */
enum {
	default_base_rtt = -1, /* us */
	default_beta = -1, /* Number of packets */
	default_expected_flows = 10,
	default_gamma = 921, /* ~= 0.9 * gamma_scale */
	default_hop_bw = 1000, /* Mbit/s */
	default_host_bw = 1000, /* Mbit/s */
};
26 |
27 | #endif /* POWERTCP_DEFS_H */
28 |
--------------------------------------------------------------------------------
/powertcp_head.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 |
/* Fallback definitions for the build-specific hook macros. Each macro is only
 * defined here if the including file has not already provided its own
 * definition.
 */

/* Attributes put in front of the struct tcp_congestion_ops definition. */
#ifndef POWERTCP_CONG_OPS_ATTRS
#define POWERTCP_CONG_OPS_ATTRS
#endif

/* By default, a congestion-ops function is declared as a plain function. */
#ifndef POWERTCP_CONG_OPS_FUNC
#define POWERTCP_CONG_OPS_FUNC(name, args...) name(args)
#endif

/* Optional prefix (e.g. a cast) in front of a function pointer initializer. */
#ifndef POWERTCP_CONG_OPS_FUNC_PTR
#define POWERTCP_CONG_OPS_FUNC_PTR
#endif

#ifndef POWERTCP_CONG_OPS_NAME_PREFIX
#define POWERTCP_CONG_OPS_NAME_PREFIX
#endif

/* Branch hints degrade to the bare condition by default. */
#ifndef POWERTCP_LIKELY
#define POWERTCP_LIKELY(cond) cond
#endif

/* Attributes put in front of the algorithm parameter variables. */
#ifndef POWERTCP_PARAM_ATTRS
#define POWERTCP_PARAM_ATTRS
#endif

#ifndef POWERTCP_UNLIKELY
#define POWERTCP_UNLIKELY(cond) cond
#endif

/* Two-level stringification so that macro arguments get expanded first. */
#ifndef __stringify
#define __stringify_1(x...) #x
#define __stringify(x...) __stringify_1(x)
#endif
35 |
/* A congestion window value paired with a snd_nxt sequence number.
 * NOTE(review): presumably the window that was in effect when snd_nxt was
 * recorded -- confirm against the users in powertcp.c.
 */
struct old_cwnd {
	u32 snd_nxt;
	unsigned long cwnd;
};
40 |
/* Declares "struct struct_name" consisting of the common PowerTCP state
 * fields below, followed by whatever additional field declarations are passed
 * as the remaining macro arguments.
 */
#define POWERTCP_STRUCT(struct_name, ...) \
	struct struct_name { \
		unsigned long base_rtt; \
		unsigned long snd_cwnd; \
		\
		unsigned long beta; /* number of packets scaled by cwnd_scale */ \
		\
		struct old_cwnd old_cwnd; \
		\
		unsigned long p_smooth; \
		\
		/* powertcp_cong_control() seems to (unexpectedly) get called once before \
		 * powertcp_init(). host_bw is still 0 then, thanks to \
		 * tcp_assign_congestion_control(), and we use that as an indicator whether \
		 * we are initialized. \
		 */ \
		unsigned long host_bw; /* Mbit/s */ \
		\
		__VA_ARGS__ \
	}
/* Identity macro: passes a group of field declarations through unchanged. */
#define POWERTCP_STRUCT_FIELDS(fields) fields
62 |
63 | // clang-format off
/* Common state only, without any variant-specific fields. */
POWERTCP_STRUCT(powertcp);

/* Common state plus the handle of the INT implementation.
 * NOTE(review): presumably the state of the "ptcp" (INT-based) variant --
 * confirm against its users.
 */
POWERTCP_STRUCT(ptcp_powertcp,
	POWERTCP_STRUCT_FIELDS(
		powertcp_int_impl_t int_impl;
	)
);

/* Common state plus RTT and timestamp bookkeeping.
 * NOTE(review): presumably the state of the "rttptcp" (RTT-based) variant --
 * confirm against its users.
 */
POWERTCP_STRUCT(rttptcp_powertcp,
	POWERTCP_STRUCT_FIELDS(
		u32 last_updated;
		unsigned long prev_rtt_us;
		u64 t; /* in ns */
		u64 t_prev; /* in ns */
	)
);
81 |
82 | #undef POWERTCP_STRUCT
83 | #undef POWERTCP_STRUCT_FIELDS
84 |
85 | POWERTCP_PARAM_ATTRS long base_rtt = default_base_rtt;
86 | POWERTCP_PARAM_ATTRS long beta = default_beta; /* Number of packets */
87 | POWERTCP_PARAM_ATTRS long expected_flows = default_expected_flows;
88 | POWERTCP_PARAM_ATTRS long gamma = default_gamma;
89 | POWERTCP_PARAM_ATTRS long hop_bw = default_hop_bw; /* Mbit/s */
90 | POWERTCP_PARAM_ATTRS long host_bw = fallback_host_bw; /* Mbit/s */
91 |
--------------------------------------------------------------------------------
/powertcp_int.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 |
/* Telemetry (INT) values of a single hop. */
struct powertcp_hop_int {
	u32 bandwidth; /* in MByte/s */
	u32 ts; /* careful: in ns */
	u32 tx_bytes;
	u32 qlen;
};
9 |
/* Telemetry (INT) data for a path with room for max_n_hops hops.
 * NOTE(review): n_hop is presumably the number of valid entries in hops[] --
 * confirm against the code filling this struct.
 */
struct powertcp_int {
	int n_hop;
	int path_id;
	struct powertcp_hop_int hops[max_n_hops];
};
15 |
--------------------------------------------------------------------------------
/powertcp_no-int.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 |
/* No-op implementations of the INT interface: every function below either
 * does nothing or returns a constant.
 */

/* Never provides INT data: always returns NULL. */
static const struct powertcp_int *get_int(struct sock *sk,
					  const struct powertcp_int *prev_int)
{
	return NULL;
}

/* Never provides previous INT data: always returns NULL. */
static const struct powertcp_int *get_prev_int(struct sock *sk)
{
	return NULL;
}

/* Nothing to set up; always returns 0. */
static int int_impl_init(struct sock *sk)
{
	return 0;
}

/* Nothing to release. */
static void int_impl_release(struct sock *sk)
{
}

/* Nothing to reset; the event is ignored. */
static void int_impl_reset(powertcp_int_impl_t *int_impl, enum tcp_ca_event ev)
{
}

/* No previous state to update. */
static void int_impl_update_old(powertcp_int_impl_t *int_impl)
{
}

/* Nothing to register; always returns 0. */
static int register_int(struct tcp_congestion_ops *cong_ops)
{
	return 0;
}

/* Nothing to unregister. */
static void unregister_int(struct tcp_congestion_ops *cong_ops)
{
}
39 |
--------------------------------------------------------------------------------
/powertcp_no-int_head.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 |
3 | enum { max_n_hops = 1 };
4 |
5 | #include "powertcp_int.c"
6 |
7 | /* In case the ts value is taken directly from a less-than-32-bit INT field,
8 | * its maximum value has to be known for correct wrap-around in calculations.
9 | */
10 | static const unsigned int max_ts = -1;
11 |
12 | /* In case the tx_bytes value is taken directly from a less-than-32-bit INT
13 | * field, its maximum value has to be known for correct wrap-around in
14 | * calculations.
15 | */
16 | static const u32 max_tx_bytes = -1;
17 |
/* Empty placeholder: no INT implementation state is kept here. */
struct powertcp_int_impl {
};

/* Handle type for the INT implementation state: a pointer to the struct. */
typedef struct powertcp_int_impl *powertcp_int_impl_t;
22 |
--------------------------------------------------------------------------------
/powertcp_trace.h:
--------------------------------------------------------------------------------
1 | #ifndef POWERTCP_TRACE_H
2 | #define POWERTCP_TRACE_H
3 |
4 | /* This header requires prior inclusion of vmlinux.h or linux/types.h. */
5 |
/* One tracing record describing a congestion control update. */
struct powertcp_trace_event {
	__u64 time;
	unsigned int sock_hash;
	__u32 cwnd;
	unsigned long rate;
	unsigned long p_norm;
	unsigned long p_smooth;
	unsigned long qlen;
	__u32 tx_bytes_diff;
	__u32 delta_t; /* careful: in ns */
	long rtt_grad; // long instead of unsigned long might truncate a huge rtt_grad
};
18 |
19 | #endif
20 |
--------------------------------------------------------------------------------
/tcp_powertcp.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 | /*
3 | * PowerTCP congestion control
4 | *
5 | * Based on the algorithm developed in:
6 | * Addanki, V., O. Michel, and S. Schmid.
7 | * "PowerTCP: Pushing the Performance Limits of Datacenter Networks."
8 | * 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22).
9 | * USENIX Association, 2022.
10 | * Available at: https://arxiv.org/pdf/2112.14309.pdf
11 | *
12 | * Implemented by:
13 | * Jörn-Thorben Hinz, TU Berlin, 2022.
14 | */
15 |
16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17 |
18 | #include "powertcp_defs.h"
19 |
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 | #include
26 |
27 | #include "powertcp_trace.h"
28 |
29 | #ifndef MEGA
30 | #define MEGA 1000000UL
31 | #endif
32 |
33 | #define CREATE_TRACE_POINTS
34 | #include "tcp_powertcp_trace.h"
35 |
36 | #ifndef BITS_TO_BYTES
37 | #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))
38 | #endif
39 |
40 | #define POWERTCP_CONG_OPS_ATTRS static __read_mostly
41 | #define POWERTCP_CONG_OPS_FUNC_ATTRS static
42 | #define POWERTCP_PARAM_ATTRS static __read_mostly
43 | #define POWERTCP_UNLIKELY unlikely
44 |
45 | #include "powertcp_no-int_head.c"
46 |
47 | #include "powertcp_head.c"
48 |
49 | module_param(base_rtt, long, 0444);
50 | MODULE_PARM_DESC(
51 | base_rtt,
52 | "base (minimum) round-trip time (RTT) in us (default: -1; -1: automatically detect)");
53 | module_param(beta, long, 0444);
54 | MODULE_PARM_DESC(beta,
55 | "additive increase (default: -1; -1: automatically set beta)");
56 | module_param(expected_flows, long, 0444);
57 | MODULE_PARM_DESC(expected_flows,
58 | "expected number of flows sharing the host NIC (default: 10)");
59 | module_param(gamma, long, 0444);
60 | MODULE_PARM_DESC(gamma, "exponential moving average weight, times " __stringify(
61 | gamma_scale) "(default: 921 ~= 0,9)");
62 | module_param(hop_bw, long, 0444);
63 | MODULE_PARM_DESC(hop_bw, "hop bandwidth in Mbit/s");
64 | module_param(host_bw, long, 0444);
65 | MODULE_PARM_DESC(
66 | host_bw,
67 | "host NIC bandwidth in Mbit/s (default: -1; -1: detect from socket)");
68 |
69 | /* Look for the host bandwidth (in Mbit/s). */
70 | static unsigned long get_host_bw(struct sock *sk)
71 | {
72 | const struct dst_entry *dst;
73 | unsigned long bw = fallback_host_bw;
74 |
75 | if (host_bw > 0) {
76 | return host_bw;
77 | }
78 |
79 | dst = __sk_dst_get(sk);
80 | if (dst && dst->dev) {
81 | struct ethtool_link_ksettings cmd;
82 | int r;
83 |
84 | rtnl_lock();
85 | /* ethtool_params_from_link_mode() would be even simpler.
86 | * But dst->dev->link_mode seems to always be 0 at this point. */
87 | r = __ethtool_get_link_ksettings(dst->dev, &cmd);
88 | rtnl_unlock();
89 | if (r == 0 && cmd.base.speed != SPEED_UNKNOWN) {
90 | bw = cmd.base.speed;
91 | pr_debug("hash=%u: got link speed: %lu Mbit/s\n",
92 | sk->sk_hash, bw);
93 | } else {
94 | pr_warn("link speed unavailable, using fallback: %lu Mbit/s\n",
95 | bw);
96 | }
97 | }
98 |
99 | return bw;
100 | }
101 |
/* Returns the socket's cached TCP clock value (tcp_clock_cache). */
static u64 get_tstamp(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_clock_cache;
}
106 |
/* Stamps the event with the current ktime_get_ns() value and emits it through
 * the cong_control tracepoint.
 */
static void output_trace_event(struct powertcp_trace_event *trace_event)
{
	trace_event->time = ktime_get_ns();
	trace_cong_control(trace_event);
}
112 |
/* Requests SOF_TIMESTAMPING_RX_HARDWARE for the socket via the
 * SO_TIMESTAMPING_NEW socket option.
 *
 * NOTE(review): the return value of tcp_setsockopt() is discarded, so a
 * rejected request goes unnoticed -- confirm that passing a SOL_SOCKET option
 * through tcp_setsockopt() takes effect on the targeted kernels.
 */
void require_hwtstamps(struct sock *sk)
{
	/* TODO: Would it make sense to execute (the equivalent of)
	 * ioctl(SIOCSHWTSTAMP) for the/a network device here?
	 */

	int optval = SOF_TIMESTAMPING_RX_HARDWARE;
	tcp_setsockopt(sk, SOL_SOCKET, SO_TIMESTAMPING_NEW,
		       KERNEL_SOCKPTR(&optval), sizeof(optval));
}
123 |
/* Atomically switches sk_pacing_status from SK_PACING_NONE to
 * SK_PACING_NEEDED; any other current status is left untouched.
 */
static void require_pacing(struct sock *sk)
{
	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}
128 |
129 | /* Set the socket pacing rate (bytes per second). */
130 | static void set_rate(struct sock *sk, unsigned long rate)
131 | {
132 | /* Before 4.20, sk_max_pacing_rate was only a u32. Use explicit min_t with
133 | * type here to avoid a warning on those older kernels.
134 | */
135 | sk->sk_pacing_rate = min_t(unsigned long, rate, sk->sk_max_pacing_rate);
136 | }
137 |
/* Tells whether the cong_control tracepoint is currently enabled. */
static bool tracing_enabled(void)
{
	return trace_cong_control_enabled();
}
142 |
143 | /* cong_avoid was previously non-optional in tcp_congestion_ops for a BPF CA.
144 | * For the module implementation it can just be set to a NULL pointer.
145 | */
146 | static const void *const powertcp_cong_avoid = NULL;
147 |
148 | #include "powertcp_no-int.c"
149 |
150 | #include "powertcp.c"
151 |
152 | static int __init powertcp_register(void)
153 | {
154 | int ret;
155 |
156 | powertcp.owner = THIS_MODULE;
157 | ret = tcp_register_congestion_control(&powertcp);
158 | if (ret) {
159 | return ret;
160 | }
161 |
162 | ret = register_int(&powertcp);
163 | if (ret) {
164 | tcp_unregister_congestion_control(&powertcp);
165 | return ret;
166 | }
167 |
168 | rttpowertcp.owner = THIS_MODULE;
169 | ret = tcp_register_congestion_control(&rttpowertcp);
170 | if (ret) {
171 | return ret;
172 | }
173 |
174 | return 0;
175 | }
176 |
/* Module exit: removes the INT hook and unregisters both congestion control
 * algorithms.
 */
static void __exit powertcp_unregister(void)
{
	unregister_int(&powertcp);
	tcp_unregister_congestion_control(&powertcp);
	tcp_unregister_congestion_control(&rttpowertcp);
}
183 |
184 | module_init(powertcp_register);
185 | module_exit(powertcp_unregister);
186 |
187 | MODULE_ALIAS("tcp_rttpowertcp");
188 | MODULE_AUTHOR("Jörn-Thorben Hinz");
189 | MODULE_DESCRIPTION("PowerTCP congestion control");
190 | MODULE_LICENSE("Dual MIT/GPL");
191 |
--------------------------------------------------------------------------------
/tcp_powertcp_trace.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT
2 | #undef TRACE_SYSTEM
3 | #define TRACE_SYSTEM powertcp
4 |
5 | #if !defined(_TRACE_POWERTCP_H) || defined(TRACE_HEADER_MULTI_READ)
6 | #define _TRACE_POWERTCP_H
7 |
8 | #include
9 |
10 | // clang-format off
/* Tracepoint powertcp:cong_control.
 *
 * Copies every field of a struct powertcp_trace_event except rtt_grad into
 * the trace entry. When printed, "rate" is multiplied by BITS_PER_BYTE and
 * divided by MEGA.
 *
 * NOTE(review): the format string labels "time" as us, whereas the module
 * fills ev->time from ktime_get_ns() -- confirm the intended unit. The
 * unsigned long fields are printed with %ld.
 */
TRACE_EVENT(cong_control,
	TP_PROTO(const struct powertcp_trace_event *ev),
	TP_ARGS(ev),
	TP_STRUCT__entry(
		__field(u64, time)
		__field(unsigned int, sock_hash)
		__field(u32, cwnd)
		__field(unsigned long, rate)
		__field(unsigned long, p_norm)
		__field(unsigned long, p_smooth)
		__field(unsigned long, qlen)
		__field(__u32, tx_bytes_diff)
		__field(__u32, delta_t)
	),
	TP_fast_assign(
		__entry->time = ev->time;
		__entry->sock_hash = ev->sock_hash;
		__entry->cwnd = ev->cwnd;
		__entry->rate = ev->rate;
		__entry->p_norm = ev->p_norm;
		__entry->p_smooth = ev->p_smooth;
		__entry->qlen = ev->qlen;
		__entry->tx_bytes_diff = ev->tx_bytes_diff;
		__entry->delta_t = ev->delta_t;
	),
	TP_printk("time=%llu us sock_hash=%u cwnd=%u rate=%ld Mbit/s p_norm=%ld p_smooth=%ld qlen=%ld tx_bytes_diff=%u bytes delta_t=%u ns",
		__entry->time,
		__entry->sock_hash,
		__entry->cwnd,
		BITS_PER_BYTE * __entry->rate / MEGA,
		__entry->p_norm,
		__entry->p_smooth,
		__entry->qlen,
		__entry->tx_bytes_diff,
		__entry->delta_t
	)
);
48 | // clang-format on
49 |
50 | #endif /* _TRACE_POWERTCP_H */
51 |
52 | #undef TRACE_INCLUDE_FILE
53 | #define TRACE_INCLUDE_FILE tcp_powertcp_trace
54 | #undef TRACE_INCLUDE_PATH
55 | #define TRACE_INCLUDE_PATH .
56 |
57 | /* This part must be outside protection */
58 | #include
59 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | # `setup-bpf` and `setup-module`
The setup scripts prepare the BPF and module implementations of PowerTCP,
respectively, for use. This includes (re)loading the implementation and its
prerequisites and opening a screen session for interactive use, e.g. for
calling `iperf`/`iperf3`.
6 |
7 | **The scripts must be called as root or with `sudo`.**
8 |
9 | ## Usage
10 | ```
11 | setup-bpf SESSION_NAME [PARAMETER...]
12 | setup-module SESSION_NAME [PARAMETER...]
13 | ```
14 |
15 | ### `SESSION_NAME`
16 | Required. Name of a predefined screen session to open. Available sessions:
17 | - `iperf-client`: Opens an empty screen session for `iperf`/`iperf3` *client*
18 | usage.
19 | - `iperf-servers`: Opens a screen session with both `iperf` and `iperf3`
20 | servers readily running inside.
21 |
22 | **Using PowerTCP and its prerequisites outside of the opened screen session
23 | will require additional, manual setup steps (e.g., joining the TCP-INT cgroup
24 | for the BPF implementation).**
25 |
26 | ### `PARAMETER`
27 | Optional. One or multiple PowerTCP algorithm parameters. Available parameters:
28 | - `base_rtt`: Base RTT in µs
29 | - `beta`: Additive increase parameter in number of packets
30 | - `hop_bw`: Link speed of the switches in Mbit/s
31 | - `host_bw`: Link speed of the host in Mbit/s
32 | - `expected_flows`: Expected number of flows on a link
33 | - `gamma`: EWMA weight in range [0.0, 1.0]
34 |
35 | Currently, parameter values passed to `setup-module` need to be scaled with the
36 | constants defined in [powertcp_defs.h](../powertcp_defs.h). `setup-bpf` accepts
37 | values in the units specified above.
38 |
39 | Parameters can be set to different values from within the screen session
40 | without calling `setup-bpf`
41 | ```console
42 | root@host:powertcp-linux# ./bpf/powertcp -f register base_rtt=100 hop_bw=25000 host_bw=25000
43 | ```
44 | or `setup-module` again
45 | ```console
46 | root@host:powertcp-linux# ./tools/reinsmod base_rtt=100 hop_bw=25000 host_bw=25000
47 | ```
48 |
49 | ## Examples
50 | ```console
user@host:powertcp-linux$ sudo ./tools/setup-bpf iperf-client base_rtt=123 hop_bw=100000 host_bw=100000
user@host:powertcp-linux$ sudo ./tools/setup-bpf iperf-servers
user@host:powertcp-linux$ sudo ./tools/setup-module iperf-client base_rtt=456 host_bw=10000
user@host:powertcp-linux$ sudo ./tools/setup-module iperf-servers
55 | ```
56 |
--------------------------------------------------------------------------------
/tools/bpf_tracer:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Usage:
4 | # bpf_tracer IPERF(3)_CMDLINE -- POWERTCP_PARAMS
5 | #
6 | # Example calls:
7 | # bpf_tracer iperf -N -c 192.168.13.3 -Z bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="2 10" base_rtt=50
8 | # bpf_tracer iperf3 -NZ -c 192.168.13.3 -C bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="2 10" base_rtt=50
9 | # bpf_tracer iperf3 -NZ -c 192.168.13.3 -C bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="1 2 4 8 10" base_rtt="50 500 5000"
10 | # bpf_tracer iperf3 -NZ -c 192.168.13.3 -C bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="1 2 4 8 10" base_rtt=50 gamma="0.5 0.7 0.9"
11 | #
12 |
13 | set -eu
14 |
# Calls `run` once for every value configured for one PowerTCP parameter.
#
# Globals:   powertcp_params (read) - maps a parameter name to a
#            whitespace-separated list of values
# Arguments: $1  - parameter assignments accumulated so far (may be empty)
#            $2  - name of the parameter to iterate over
#            $3… - remaining parameter names, passed on to `run`
iterate_param()
{
	local inargs=$1
	local param=$2
	shift 2
	local -a vals
	read -r -a vals <<<"${powertcp_params[$param]}"

	# Keep the loop variable local, too: this function is re-entered through
	# `run`, and nothing of it should leak into the caller's scope.
	local args val
	for val in "${vals[@]}"; do
		args="${inargs:+$inargs }$param=$val"
		run "$args" "$@"
	done
}
29 |
# Runs one traced iperf measurement, or fans out over the remaining
# parameters.
#
# With more than one argument, the call is delegated to iterate_param, which
# calls back into this function once per parameter value. With exactly one
# argument, PowerTCP is registered with the given parameter assignments, the
# tracer is started in the background writing to
# "bpf_powertcp-<assignments>.csv", and the iperf command line is executed.
#
# Globals:   repo_dir, iperf_cmdline (read)
# Arguments: $1  - parameter assignments for this run (may be empty)
#            $2… - names of parameters that still have to be iterated
# Returns:   the exit status of the iperf command
run()
{
	if [[ $# -gt 1 ]]; then
		iterate_param "$@"
		return
	fi

	local args=$1

	local csv_file
	printf -v csv_file "bpf_powertcp-%s.csv" "$args"

	[[ -z $args ]] || printf "# %s\n" "$args"

	# shellcheck disable=SC2086
	"${repo_dir}/bpf/powertcp" register -f tracing $args

	"${repo_dir}/bpf/powertcp" trace -C > "$csv_file" &
	local trace_pid=$!

	# Do not let `set -e` abort the script right here when iperf fails: the
	# background tracer would be left running. Remember the status, stop the
	# tracer, and only then report the failure to the caller.
	local iperf_status=0
	"${iperf_cmdline[@]}" || iperf_status=$?

	sleep 3
	kill "$trace_pid"
	wait

	printf "\n"
	return "$iperf_status"
}
57 |
# Locate the repository root relative to this script.
repo_dir=${0%/*}/..
if [[ ! -d ${repo_dir}/tools ]]; then
	printf "I don’t know where I’m called from\n" >&2
	exit 2
fi

# Everything up to a literal "--" is the iperf/iperf3 command line.
iperf_cmdline=()
while [[ $# -gt 0 && $1 != -- ]]; do
	iperf_cmdline+=( "$1" )
	shift
done

# Skip --
[[ $# -eq 0 ]] || shift

# The remaining arguments are NAME=VALUES assignments. Split at the FIRST "="
# (%%), so that a value which itself contains "=" does not end up in the name.
declare -A powertcp_params
for arg in "$@"; do
	powertcp_params+=( ["${arg%%=*}"]="${arg#*=}" )
done

# Start from a clean state; failures to unregister/unload are ignored.
"${repo_dir}/bpf/powertcp" unregister || :
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload || :

"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "${TCP_INT_ENABLE:-enable}"

# Move this shell into the TCP-INT cgroup.
printf "%d" "$$" > /sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

run "" "${!powertcp_params[@]}"
87 |
--------------------------------------------------------------------------------
/tools/gro_experiment:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -eu
3 |
# Performs $runs iperf3 client measurements and prints one CSV line
# "cca,gro,int,avg_throughput" per run; the throughput is the received
# bits per second with the fractional part cut off.
#
# Globals:   runs, srv_ip, cca, gro, int (read)
# Outputs:   CSV lines on stdout, progress information via progress()
client_runs()
{
	# Keep the loop counter and the per-run result out of the global scope.
	local i avg_throughput
	local run_str
	printf -v run_str '\e[u\e[Krun %%i/%i' "$runs"

	progress '\e[s'
	for ((i = 0; i < runs; ++i)); do
		progress "$run_str" "$i"
		avg_throughput=$(
			iperf3 --client="$srv_ip" \
				--congestion="$cca" \
				--interval=0 \
				--json \
				--no-delay \
				--omit=1 |
				jq .end.sum_received.bits_per_second
		)
		printf '%s,%s,%s,%s\n' "$cca" "$gro" "$int" "${avg_throughput%%.*}"
		sleep 1 # Give the server a moment to be ready again
	done
	progress "${run_str}\n" "$runs"
}
26 |
# Prints a printf-style message to stderr and terminates the script.
#
# Arguments: $1  - exit status
#            $2  - printf format string
#            $3… - arguments for the format string
die()
{
	local -r exit_code=$1
	shift
	{
		# shellcheck disable=SC2059
		printf "$@"
	} >&2
	exit "$exit_code"
}
35 |
# Prints a printf-style progress message to stderr, but only while stdout is
# not a terminal.
#
# Arguments: $1 - printf format string, $2… - its arguments
progress()
{
	if [[ ! -t 1 ]]; then
		# shellcheck disable=SC2059
		printf "$@" >&2
	fi
}
41 |
# Serves $runs iperf3 client connections, one single-shot server per run.
#
# Globals: runs (read)
server_runs()
{
	# Keep the loop counter out of the global scope.
	local i
	for ((i = 0; i < runs; ++i)); do
		iperf3 --one-off --server
	done
}
48 |
readonly role=$1
readonly iface=$2

if [[ $role == client ]]; then
	readonly srv_ip=$3
	readonly runs=${4:-10}
	readonly ccas=${5:-cubic}

	# The server feeds whatever it receives to eval. Send the values
	# shell-quoted (%q) rather than pasted between double quotes, so that
	# quotes, "$" or backticks in a value arrive as data, not as code.
	printf '%s=%q\n' ccas "$ccas" runs "$runs" |
		netcat -q0 "$srv_ip" 5201 ||
		die 1 "Cannot reach server, experiment must be started there first!\n"
else
	# Executing arbitrary remote shell code, what could go wrong O:-)
	# NOTE: the input of whoever connects to port 5201 first is evaluated.
	eval "$(netcat -q0 -l 5201)"
fi
65 |
# Locate the repository root relative to this script.
readonly repo_dir=${0%/*}/..
if [[ ! -d ${repo_dir}/tools ]]; then
	printf "I don’t know where I’m called from\n" >&2
	exit 2
fi

# Reload TCP-INT (a failing unload is ignored) and move this shell into the
# TCP-INT cgroup.
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload &>/dev/null || :
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load
printf "%d" "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

# CSV header, then one series of runs for every combination of congestion
# control algorithm, GRO setting and TCP-INT state. $ccas is deliberately left
# unquoted: it is a whitespace-separated list.
printf 'cca,gro,int,avg_throughput\n'
for cca in $ccas; do
	for gro in off on; do
		ethtool -K "$iface" gro "$gro"

		for int in disable enable; do
			progress 'cca=%s gro=%s int=%s:\t' "$cca" "$gro" "$int"
			"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "$int"
			# Dispatches to client_runs or server_runs.
			"${role}_runs"
		done
	done
done
88 |
--------------------------------------------------------------------------------
/tools/gro_plot:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 |
5 | import matplotlib as mpl
6 | import matplotlib.pyplot as plt
7 | import matplotlib.ticker as ticker
8 | import pandas as pd
9 |
10 | FIGSIZE = (2.5, 2)
11 |
12 | REPLACEMENTS = {
13 | "bbr": "BBR",
14 | "cubic": "Cubic",
15 | "dctcp": "DCTCP",
16 | "disable": "disabled",
17 | "enable": "enabled",
18 | "gro": "GRO",
19 | "int": "INT",
20 | "reno": "Reno",
21 | }
22 |
23 |
def main():
    """Plot the mean throughput per congestion control algorithm.

    Reads the CSV file named on the command line, converts ``avg_throughput``
    to Gbit/s, aggregates it per CCA for every GRO/INT combination and draws
    the result as a bar chart. The figure is written to
    ``cca-gro-int-avg_throughput.pdf`` and then shown.
    """
    plt.rcParams["pdf.fonttype"] = 42

    try:
        mpl.style.use("seaborn-v0_8-colorblind")
    except Exception as err:
        print(f"Failed to change matplotlib style: {err}")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--link-speed",
        type=int,
        default=25,
        help="speed of the measured link in Gbit/s",
    )
    parser.add_argument("csv_file", type=argparse.FileType("r"))
    options = parser.parse_args()

    raw = pd.read_csv(options.csv_file)
    frame = raw.rename(columns=REPLACEMENTS).replace(REPLACEMENTS)
    options.csv_file.close()  # Be nice and close files :-)

    # bit/s -> Gbit/s
    frame["avg_throughput"] /= 10**9

    means = pd.pivot_table(frame, index="cca", columns=["GRO", "INT"])
    throughput = means["avg_throughput"]

    fig, ax = plt.subplots(figsize=FIGSIZE, layout="constrained")
    ax = throughput.plot(
        ax=ax,
        kind="bar",
        rot=0,
        xlabel="Congestion control algorithm",
        ylabel="Throughput (Gbps)",
    )
    ax.grid(linestyle="--")
    ax.set_ylim(0, options.link_speed)
    y_ticks = [1, 10, 15, 20, 25, 50, 100]
    ax.yaxis.set_major_locator(ticker.FixedLocator(y_ticks))
    fig.savefig("cca-gro-int-avg_throughput.pdf")
    plt.show()
63 |
64 | if __name__ == "__main__":
65 | raise SystemExit(main())
66 |
--------------------------------------------------------------------------------
/tools/iperf_csv:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | printf 'datetime,srcip,srcport,dstip,dstport,thread,interval,cwnd,rate\n'
4 | iperf -yc -i1 "$@"
5 |
--------------------------------------------------------------------------------
/tools/iratio_experiment:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Example usage:
4 | # ./tools/iratio_experiment client eno1 192.168.13.3 user@tofino /path/to/private/ssh/key bf-sde-9.7.2/ '1 2 4 8 16'
5 | # ./tools/iratio_experiment client -C reno -r 1 -s 100 -t 60 eno1 192.168.13.3 user@tofino /path/to/private/ssh/key bf-sde-9.7.2/ '1 2 4 8 16'
6 | #
7 | # ./tools/iratio_experiment server eno1
8 | #
9 |
10 | set -eu
11 |
# Performs $runs iperf3 client measurements of $run_duration seconds each and
# prints one CSV line "cca,int,gro,iratio,avg_throughput" per run; the
# throughput is the received bits per second with the fractional part cut off.
#
# Globals:   runs, run_duration, srv_ip, cca, int, gro, iratio (read)
# Outputs:   CSV lines on stdout, progress information via progress()
client_runs()
{
	# Keep the loop counter and the per-run result out of the global scope.
	local i avg_throughput
	local run_str
	printf -v run_str '\e[u\e[Krun %%i/%i' "$runs"

	progress '\e[s'
	for ((i = 0; i < runs; ++i)); do
		progress "$run_str" "$i"
		avg_throughput=$(
			iperf3 --client="$srv_ip" \
				--congestion="$cca" \
				--interval=0 \
				--json \
				--no-delay \
				--omit=1 \
				--time="$run_duration" |
				jq .end.sum_received.bits_per_second
		)
		printf '%s,%s,%s,%d,%s\n' "$cca" "$int" "$gro" "$iratio" "${avg_throughput%%.*}"
		sleep 1 # Give the server a moment to be ready again
	done
	progress "${run_str}\n" "$runs"
}
35 |
# Writes a printf-style message to stderr, then exits the script.
#
# Arguments: $1  - exit status
#            $2  - printf format string
#            $3… - arguments for the format string
die()
{
	local exit_status=$1
	shift

	# shellcheck disable=SC2059
	printf "$@" 1>&2

	exit "$exit_status"
}
44 |
# Prints a printf-style progress message to stderr. On the server this always
# happens; on the client only while stdout is not a terminal.
#
# Globals:   role (read)
# Arguments: $1 - printf format string, $2… - its arguments
progress()
{
	[[ $role == server || ! -t 1 ]] || return 0

	# shellcheck disable=SC2059
	printf "$@" >&2
}
52 |
# Prints a newline, then serves $runs iperf3 client connections, one
# single-shot server per run.
#
# Globals: runs (read)
server_runs()
{
	# Keep the loop counter out of the global scope.
	local i
	printf '\n'
	for ((i = 0; i < runs; ++i)); do
		iperf3 --one-off --server
	done
}
60 |
61 | readonly role=$1
62 | shift
63 |
# Option parsing. The client accepts:
#   -C CCA   congestion control algorithm (default: cubic)
#   -M MTU   MTU to set on the interface (default: 1500)
#   -q N     queue depth threshold passed to the switch (default: 1500)
#   -r N     number of runs per configuration (default: 10)
#   -s N     link speed passed to the switch (default: 25)
#   -t SECS  duration of a single run (default: 10)
# The server accepts no options at all.
if [[ $role == client ]]; then
	cca=cubic
	link_speed=25
	mtu=1500
	qdepth_threshold=1500
	run_duration=10
	runs=10

	# "q:" has to be part of the option string, otherwise the q) branch
	# below can never be reached and -q is rejected as an unknown option.
	while getopts 'C:M:q:r:s:t:' opt; do
		case $opt in
			C) cca=$OPTARG ;;
			M) mtu=$OPTARG ;;
			q) qdepth_threshold=$OPTARG ;;
			r) runs=$OPTARG ;;
			s) link_speed=$OPTARG ;;
			t) run_duration=$OPTARG ;;
			?) exit 2 ;;
		esac
	done

	readonly cca
	readonly link_speed
	readonly qdepth_threshold
	readonly run_duration
	readonly runs

	shift $((OPTIND - 1))
else
	while getopts '' opt; do
		case $opt in
			?) exit 2 ;;
		esac
	done
fi
98 |
# First positional argument (both roles): the network interface to configure.
readonly iface=$1
shift

# The client additionally needs the server address, the SSH login, key file
# and SDE directory for the switch, and the list of iratio values.
if [[ $role == client ]]; then
	readonly srv_ip=$1
	readonly switch_user_host=$2
	readonly private_key_file=$3
	readonly switch_sde_dir=$4
	readonly iratios=$5

	shift 5
fi

# Anything left over at this point is an error.
if [[ $# -gt 0 ]]; then
	die 2 'unexpected arguments -- %s\n' "$*"
fi
115 |
if [[ $role == client ]]; then
	progress 'Sending experiment parameters to server …\n'
	# The server feeds whatever it receives to eval. Send the values
	# shell-quoted (%q) rather than pasted between double quotes, so that
	# quotes, "$" or backticks in a value arrive as data, not as code.
	printf '%s=%q\n' \
		cca "$cca" \
		iratios "$iratios" \
		mtu "$mtu" \
		runs "$runs" |
		netcat -q0 "$srv_ip" 5201 ||
		die 1 "Cannot reach server, experiment must be started there first!\n"
else
	progress 'Waiting for client to send experiment parameters …\n'
	# Executing arbitrary remote shell code, what could go wrong O:-)
	# NOTE: the input of whoever connects to port 5201 first is evaluated.
	eval "$(netcat -q0 -l 5201)"
fi
129 |
# Locate the repository root relative to this script.
readonly repo_dir=${0%/*}/..
if [[ ! -d ${repo_dir}/tools ]]; then
	printf "I don’t know where I’m called from\n" >&2
	exit 2
fi

# Reload TCP-INT (a failing unload is ignored) and move this shell into the
# TCP-INT cgroup.
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload &>/dev/null || :
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load
printf "%d" "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

# Only the client produces CSV output.
[[ $role == server ]] || printf 'cca,int,gro,iratio,avg_throughput\n'

ip link set dev "$iface" mtu "$mtu"

# First pass: TCP-INT disabled; recorded with an iratio of -1.
int=disable
iratio=-1
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "$int"
for gro in off on; do
	ethtool -K "$iface" gro "$gro"

	progress 'cca=%s gro=%s int=%s iratio=%d:\t' "$cca" "$gro" "$int" "$iratio"
	# Dispatches to client_runs or server_runs.
	"${role}_runs"
done

# Second pass: TCP-INT enabled, once per iratio value. $iratios is
# deliberately left unquoted: it is a whitespace-separated list.
int=enable
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "$int"
for iratio in $iratios; do
	if [[ $role == client ]]; then
		# The client redeploys the switch configuration over SSH. The
		# here-document is expanded locally before it is sent (hence
		# the SC2087 exemption); "<<-" requires tab indentation.
		# shellcheck disable=SC2087
		ssh -i "$private_key_file" "$switch_user_host" bash -s <<-EOF
			set -e
			cd "$switch_sde_dir"
			. set_sde.bash >/dev/null
			./pkgsrc/switch-p4-16/scripts/tcp_int_cp.py deploy --link "$link_speed" --iratio "$iratio" --qdepth_th "$qdepth_threshold" &>/dev/null
		EOF
	fi

	for gro in off on; do
		ethtool -K "$iface" gro "$gro"

		progress 'cca=%s gro=%s int=%s iratio=%d:\t' "$cca" "$gro" "$int" "$iratio"
		"${role}_runs"
	done
done
174 |
--------------------------------------------------------------------------------
/tools/iratio_plot:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 |
5 | import matplotlib as mpl
6 | import matplotlib.legend_handler
7 | import matplotlib.pyplot as plt
8 | import matplotlib.ticker as ticker
9 | import pandas as pd
10 |
11 | FIGSIZE = (2.5, 2)
12 |
13 | LINESTYLES = ["--", "-"]
14 | MARKERS = ["x", "s"]
15 |
16 | REPLACEMENTS = {
17 | "gro": "GRO",
18 | "int": "INT",
19 | }
20 |
21 |
def main():
    """Plot throughput over the iratio ("tagratio") for one CCA.

    Reads the CSV file named on the command line and keeps the rows with INT
    enabled and an iratio of at most ``--max-iratio`` (if given). One line
    per GRO setting is drawn on a logarithmic x axis, the figure is saved as
    ``<cca>-iratio-gro-avg_throughput.pdf`` and then shown.

    Returns an error message if the file contains more than one CCA,
    otherwise ``None``.
    """
    plt.rcParams["pdf.fonttype"] = 42

    try:
        mpl.style.use("seaborn-v0_8-colorblind")
    except Exception as err:
        print(f"Failed to change matplotlib style: {err}")

    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", type=argparse.FileType("r"))
    parser.add_argument(
        "--max-iratio", type=int, help="maximum iratio value to plot"
    )
    options = parser.parse_args()

    raw = pd.read_csv(options.csv_file)
    df = raw.rename(columns=REPLACEMENTS).replace(REPLACEMENTS)
    options.csv_file.close()  # Be nice and close files :-)

    if df["cca"].nunique() > 1:
        return "cannot plot for multiple CCAs"
    cca = df.loc[0, "cca"]

    # Cap the plotted range at --max-iratio, if that option was given.
    max_iratio = max(df["iratio"])
    if options.max_iratio is not None:
        max_iratio = min(options.max_iratio, max_iratio)

    # bit/s -> Gbit/s
    df["avg_throughput"] /= 10**9
    int_enabled = df["INT"] == "enable"
    within_range = df["iratio"] <= max_iratio
    df = df.loc[int_enabled & within_range]

    fig, ax = plt.subplots(figsize=FIGSIZE, layout="constrained")
    styled_groups = zip(LINESTYLES, MARKERS, df.groupby("GRO"))
    for linestyle, marker, (gro, gro_df) in styled_groups:
        per_iratio = gro_df.groupby("iratio").mean(numeric_only=True)
        ax.semilogx(
            "avg_throughput",
            data=per_iratio,
            label=f"GRO {gro}",
            linestyle=linestyle,
            marker=marker,
        )

    ax.grid(linestyle="--")
    ax.legend(
        bbox_to_anchor=(-0.15, 1.1, 1.15, 0),
        borderaxespad=0,
        loc="lower left",
        mode="expand",
        ncols=df["GRO"].nunique(),
    )
    ax.set_xlabel("tagratio")
    ax.set_ylabel("Throughput (Gbps)")

    # Label the middle one of the distinct iratio values and mark the largest
    # plotted one as being close to "no INT".
    iratio_values = df["iratio"].unique()
    mid_iratio = iratio_values[len(iratio_values) // 2]
    tick_positions = [1, 4, mid_iratio, max_iratio]
    tick_labels = [1, 4, mid_iratio, "≈ no INT"]
    ax.set_xticks(tick_positions, tick_labels)

    ax.xaxis.set_minor_locator(ticker.NullLocator())
    ax.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100]))

    fig.savefig(f"{cca}-iratio-gro-avg_throughput.pdf")

    plt.show()
86 |
87 |
# main() returns None on success or an error message string; SystemExit turns
# None into exit status 0 and prints a string to stderr with exit status 1.
if __name__ == "__main__":
    raise SystemExit(main())
90 |
--------------------------------------------------------------------------------
/tools/mtu_experiment:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -eu
4 |
# Run the iperf3 client "runs" times against the server and print one CSV row
# per run.
#
# Globals (read): runs, srv_ip, cca, run_duration, mtu
# Outputs:        "cca,mtu,avg_throughput" rows on stdout (throughput in
#                 bits per second as reported by iperf3's JSON summary)
# Returns:        non-zero, without printing a row, as soon as a run yields
#                 no numeric throughput
client_runs()
{
    local i avg_throughput
    local -r number_re='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'

    for ((i = 0; i < runs; ++i)); do
        avg_throughput=$(
            iperf3 --client="$srv_ip" \
                --congestion="$cca" \
                --interval=0 \
                --json \
                --no-delay \
                --omit=1 \
                --time="$run_duration" |
                jq .end.sum_received.bits_per_second
        )

        # jq prints "null" when the JSON lacks the field (and nothing at all
        # for empty input). Such a value must not end up in the CSV as a
        # bogus 0.000000 measurement.
        if [[ ! $avg_throughput =~ $number_re ]]; then
            printf 'client_runs: no throughput reported for cca=%s mtu=%s (got "%s")\n' \
                "$cca" "$mtu" "$avg_throughput" >&2
            return 1
        fi

        printf '%s,%d,%f\n' "$cca" "$mtu" "$avg_throughput"
        sleep 1 # Give the server a moment to be ready again
    done
}
22 |
# Print a printf-style message to stderr and terminate the script.
#
# Arguments: $1    exit status
#            $2... printf format string followed by its arguments
die()
{
    local -r exit_status=$1
    # shellcheck disable=SC2059
    printf "${@:2}" >&2
    exit "$exit_status"
}
31 |
# Print a printf-style progress message to stderr.
#
# The message is suppressed only for a non-server role whose stdout is a
# terminal; the server, and any run with redirected stdout, get the message.
#
# Globals (read): role
# Arguments:      printf format string followed by its arguments
progress()
{
    if [[ $role != server && -t 1 ]]; then
        return 0
    fi
    # shellcheck disable=SC2059
    printf "$@" >&2
}
39 |
# Serve exactly "runs" iperf3 tests, one after another; each iperf3 instance
# is started with --one-off and --server.
#
# Globals (read):    runs
# Globals (written): i (loop counter, equals runs afterwards)
server_runs()
{
    i=0
    while ((i < runs)); do
        ((++i))
        iperf3 --one-off --server
    done
}
46 |
# Usage:
#   mtu_experiment client [-C CCA] [-r RUNS] [-t SECONDS] IFACE SERVER_IP MTU...
#   mtu_experiment server IFACE
#
# The server must be started first. The client sends its parameters (CCA,
# MTUs, number of runs) to the server over TCP port 5201, then both sides
# step through the MTUs in lockstep. The client prints a CSV
# (cca,mtu,avg_throughput) on stdout.
readonly role=$1
shift

# Fail early on a mistyped role instead of at the final "${role}_runs" call,
# which would only be reached after the interface had been reconfigured.
[[ $role == client || $role == server ]] || die 2 'unknown role -- %s\n' "$role"

if [[ $role == client ]]; then
    cca=cubic
    run_duration=10
    runs=10

    while getopts 'C:r:t:' opt; do
        case $opt in
            C) cca=$OPTARG ;;
            r) runs=$OPTARG ;;
            t) run_duration=$OPTARG ;;
            # "?" matches any single character, i.e. everything not handled
            # above; getopts has already printed a diagnostic for it.
            ?) exit 2 ;;
        esac
    done

    readonly cca
    readonly run_duration
    readonly runs

    shift $((OPTIND - 1))
else
    # The server takes no options; reject any that are given.
    while getopts '' opt; do
        case $opt in
            ?) exit 2 ;;
        esac
    done
fi

readonly iface=$1
shift

if [[ $role == client ]]; then
    readonly srv_ip=$1
    shift

    [[ $# -gt 0 ]] || die 2 'missing MTU(s)\n'
    readonly mtus=("$@")
else
    if [[ $# -gt 0 ]]; then
        die 2 'unexpected arguments -- %s\n' "$*"
    fi
fi

if [[ $role == client ]]; then
    progress 'Sending experiment parameters to server …\n'
    # The parameters travel as shell assignments which the server evals.
    netcat -q0 "$srv_ip" 5201 <<-EOF || die 1 "Cannot reach server, experiment must be started there first!\n"
cca="$cca"
mtus=(${mtus[*]})
runs="$runs"
EOF
else
    progress 'Waiting for client to send experiment parameters …\n'
    # Executing arbitrary remote shell code, what could go wrong O:-)
    eval "$(netcat -q0 -l 5201)"
fi

# The tools are addressed relative to this script's location.
readonly repo_dir=${0%/*}/..
if [[ ! -d ${repo_dir}/tools ]]; then
    die 2 "I don’t know where I’m called from\n"
fi

# Start from a clean TCP-INT state; a failing unload is ignored on purpose.
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload &>/dev/null || :
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load
"${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" enable
# Add this shell to the cgroup.tcp-int cgroup.
printf '%d' "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

ethtool -K "$iface" gro on

# Only the client produces CSV output.
[[ $role == server ]] || printf 'cca,mtu,avg_throughput\n'

for mtu in "${mtus[@]}"; do
    ip link set dev "$iface" mtu "$mtu"
    sleep 1
    "${role}_runs"
done
125 |
--------------------------------------------------------------------------------
/tools/mtu_plot:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 |
5 | import matplotlib as mpl
6 | import matplotlib.legend_handler
7 | import matplotlib.pyplot as plt
8 | import matplotlib.ticker as ticker
9 | import pandas as pd
10 |
# Figure size (width, height) handed to plt.subplots(figsize=...).
FIGSIZE = (2.5, 2)

# Applied to the CSV column names (rename) and to the cell values (replace).
REPLACEMENTS = {
    "mtu": "MTU",
}
16 |
17 |
def main():
    """Plot the mean throughput over the MTU.

    Reads the CSV file named on the command line (columns used: cca, mtu,
    avg_throughput), averages the throughput per MTU, saves the figure as
    "<cca>-mtu-avg_throughput.pdf" and shows it. --max-mtu drops all larger
    MTUs; it does not have to be one of the measured MTUs.

    Returns an error message if there is nothing to plot or the CSV mixes
    several CCAs, otherwise None.
    """
    plt.rcParams.update({"pdf.fonttype": 42})

    try:
        mpl.style.use("seaborn-v0_8-colorblind")
    except Exception as e:
        print("Failed to change matplotlib style: {}".format(e))

    argparser = argparse.ArgumentParser()
    argparser.add_argument("csv_file", type=argparse.FileType("r"))
    argparser.add_argument("--max-mtu", type=int, help="maximum MTU value to plot")
    args = argparser.parse_args()

    df = pd.read_csv(args.csv_file).rename(columns=REPLACEMENTS).replace(REPLACEMENTS)
    args.csv_file.close()  # Be nice and close files :-)

    if df.empty:
        return "no data in CSV file"
    if df["cca"].nunique() > 1:
        return "cannot plot for multiple CCAs"
    cca = df["cca"].iloc[0]

    if args.max_mtu is not None:
        df = df.loc[df["MTU"] <= args.max_mtu]
        if df.empty:
            return f"no MTU less than or equal to {args.max_mtu} in CSV file"

    df = df.groupby("MTU").mean(numeric_only=True)
    df["avg_throughput"] /= 10**9  # bit/s -> Gbit/s

    # The largest MTU that is actually present after filtering. Looking up
    # the --max-mtu value itself would fail whenever it is not a measured MTU.
    max_mtu = df.index.max()

    fig, ax = plt.subplots(figsize=FIGSIZE, layout="constrained")
    ax.plot(
        [max_mtu], [df.loc[max_mtu, "avg_throughput"]]
    )  # Quick-n-dirty force same color as for GRO=on in iratio plot
    ax.plot(df.index, df["avg_throughput"], label="GRO on", marker="s")

    ax.grid(linestyle="--")
    ax.legend(
        bbox_to_anchor=(0.45, 1.1, 0.55, 0),
        borderaxespad=0,
        loc="lower left",
        mode="expand",
    )
    ax.set_xlabel("MTU (bytes)")
    ax.set_ylabel("Throughput (Gbps)")

    ax.xaxis.set_major_locator(ticker.FixedLocator([1500, 4000, 7000, 9000]))
    ax.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100]))

    fig.savefig(f"{cca}-mtu-avg_throughput.pdf")

    plt.show()
70 |
71 |
# main() returns None on success or an error message string; SystemExit turns
# None into exit status 0 and prints a string to stderr with exit status 1.
if __name__ == "__main__":
    raise SystemExit(main())
74 |
--------------------------------------------------------------------------------
/tools/plot:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import sys
5 | from pathlib import Path
6 |
7 | import matplotlib.pyplot as plt
8 | import matplotlib.ticker as ticker
9 | import numpy as np
10 | import pandas as pd
11 | from scipy.signal import savgol_filter
12 |
13 | # "CN color spec", see https://matplotlib.org/stable/tutorials/colors/colors.html#specifying-colors
# Colors "C0" … "C10"; main() picks one per plotted y column (modulo length).
COLORS = [f"C{i}" for i in range(11)]


# Column names accepted for the -x and -y options.
COLUMNS = (
    "ack_seq",
    "base_rtt",
    "beta",
    "cwnd",
    "delta_t",
    "ev",
    "p_norm",
    "p_smooth",
    "qlen",
    "rate",
    "rtt_grad",
    "time",
)

# Default matplotlib format string per column; columns not listed use "-".
FORMATS = {
    "ack_seq": ".",
}

# Axis/legend label per column; unlisted columns are labelled with their name.
LABELS = {
    "ack_seq": "ACK'ed sequence #",
    "base_rtt": "Base RTT",
    "beta": "Additive increase β",
    "cwnd": "Congestion window",
    "delta_t": "Time delta between ACKs",
    "ev": "Resets",
    "p_norm": "Normalized power",
    "p_smooth": "Smoothed power",
    "qlen": "Queue length",
    "rate": "Pacing rate",
    "rtt_grad": "RTT gradient",
    "time": "Time",
}

# NOTE(review): not referenced anywhere else in this script.
LIMITS = {
    "rate": (0, 100 * 10**9),
}

# Display form of a unit name; units not listed are displayed as they are.
PRETTY_UNITS = {
    "bit^2/s": r"$\frac{bit^2}{s}$",
    "us": "µs",
}

# Factor a column is multiplied with to convert it from its default unit (see
# UNITS) into the given unit; units not listed use factor 1. load_df() divides
# by these factors to bring iperf data into the default units.
UNIT_FACTORS = {
    "bit/s": 8,
    "us": 0.001,
    "ms": 0.000001,
    "s": 0.000000001,
}

# The first unit specified for a column, if any, is the default unit:
UNITS = {
    "base_rtt": ("us", "ms", "s"),
    "beta": ("bytes",),
    "cwnd": ("bytes",),
    "delta_t": ("ns",),
    "ev": (None,),
    "p_norm": (None,),
    "p_smooth": (None,),
    "qlen": ("bytes",),
    "rate": ("bytes/s", "bit/s"),
    "rtt_grad": (None,),
    "time": ("ns", "us", "ms", "s"),
}
81 |
82 |
class DictStrArg:
    """argparse ``type=`` callable that parses "key:value[,key:value…]".

    Every value is converted with ``value_type``; the result is a dict. Empty
    list items (e.g. from a trailing comma) are ignored. Only the first colon
    of an item separates key and value, so a value may itself contain colons
    (such as the matplotlib format string ":").
    """

    def __init__(self, value_type):
        self._value_type = value_type

    def __call__(self, list_str):
        """Parse ``list_str``; raise argparse.ArgumentTypeError if malformed."""

        def to_key_val(s):
            key, _sep, value = s.partition(":")
            if len(key) == 0:
                raise argparse.ArgumentTypeError(f"missing key: '{s}'")
            if len(value) == 0:
                raise argparse.ArgumentTypeError(
                    f"missing value for key '{key}': '{s}'"
                )
            try:
                return key, self._value_type(value)
            except ValueError:
                raise argparse.ArgumentTypeError(
                    f"invalid {self._value_type.__name__} value for key '{key}': '{value}'"
                ) from None

        return dict(to_key_val(s) for s in list_str.split(",") if len(s) > 0)
104 |
105 |
class OddIntArg(int):
    """argparse ``type=`` for an odd integer greater than 2.

    Used for the --smooth filter window size, which is passed to
    savgol_filter() together with a polynomial order of 2 and therefore has
    to be larger than 2. Anything else raises argparse.ArgumentTypeError.
    """

    def __new__(cls, arg_str):
        try:
            self = super().__new__(cls, arg_str)
        except ValueError:
            pass
        else:
            if self % 2 != 0 and self > 2:
                return self
        raise argparse.ArgumentTypeError(
            f"invalid odd integer value greater than 2: '{arg_str}'"
        )
116 |
117 |
def check_unit(col, unit):
    """Return the unit to use for column ``col``.

    ``unit`` is the unit requested by the user, or None to select the
    column's default unit (the first entry of its UNITS tuple; None for
    columns without units). Raises LookupError if the requested unit is not
    listed for the column.
    """
    allowed = UNITS.get(col, (None,))
    unit = allowed[0] if unit is None else unit
    if unit is None or unit in allowed:
        return unit
    raise LookupError(f"unit “{unit}“ not available for “{col}“")
125 |
126 |
def load_df(csv_file):
    """Read ``csv_file`` into a DataFrame, close the file and normalize it.

    - A "datetime" column marks iperf CSV output: "time" is derived from it
      and "rate" is divided by the bit/s factor.
    - "time" is shifted so that it starts at 0.
    - If both "p_norm_scaled" and "power_scale" exist, they are replaced by
      their quotient in a "p_norm" column.
    """
    df = pd.read_csv(csv_file)
    csv_file.close()

    # Is this CSV output from iperf?
    if "datetime" in df.columns:
        df["time"] = df["datetime"] / UNIT_FACTORS["s"]
        df["rate"] /= UNIT_FACTORS["bit/s"]

    df["time"] -= df["time"].min()

    scaled_columns = ["p_norm_scaled", "power_scale"]
    if all(name in df.columns for name in scaled_columns):
        df["p_norm"] = df["p_norm_scaled"] / df["power_scale"]
        df = df.drop(columns=scaled_columns)

    return df
149 |
150 |
def set_ylim(axis, min_val, max_val, min_lim, max_lim):
    """Set the y limits of ``axis`` to the clamped data range plus its margin.

    ``min_val``/``max_val`` is the data range. ``min_lim``/``max_lim`` are
    optional bounds: the lower end is raised to ``min_lim`` and the upper end
    lowered to ``max_lim``; a bound that cannot be compared (e.g. None) is
    ignored. The y margin reported by ``axis.margins()`` is then added on
    both ends as a fraction of the resulting range.
    """

    def clamp(pick, value, limit):
        try:
            return pick(value, limit)
        except TypeError:
            return value

    lower = clamp(max, min_val, min_lim)
    upper = clamp(min, max_val, max_lim)
    _, ymargin = axis.margins()
    padding = ymargin * (upper - lower)
    axis.set_ylim(lower - padding, upper + padding)
163 |
164 |
def setup_axis(axis, col, unit):
    """Configure minor ticks and the major tick formatter of ``axis``.

    ``col`` is the plotted column and ``unit`` the unit its values are in.
    Sequence numbers and the congestion window are shown as plain integers,
    byte and bit based units in engineering notation, time units as a number
    followed by the unit (3 decimals for seconds, none otherwise). Any other
    unit keeps the default formatter.
    """
    pretty_unit = PRETTY_UNITS.get(unit, unit)
    axis.set_minor_locator(ticker.AutoMinorLocator())

    if col in ("ack_seq", "cwnd"):
        axis.set_major_formatter(ticker.StrMethodFormatter("{x:.0f}"))
        return

    if unit in ("bit/s", "bytes", "bytes/s"):
        axis.set_major_formatter(ticker.EngFormatter(unit=pretty_unit))
        return

    if unit in ("s", "ms", "us", "ns"):
        precision = 3 if unit == "s" else 0
        axis.set_major_formatter(
            ticker.StrMethodFormatter(f"{{x:.{precision}f}} {pretty_unit}")
        )
181 |
182 |
def main():
    """Plot columns of one or more trace CSV files, one subplot per file.

    Command line (see --help): CSV files, -x/-y column selection, per-column
    --fmt/--min/--max/--unit settings given as "col:value[,col:value…]",
    --smooth, --title, and the flow selection (--flow INDEX or --all-flows).
    With --info only a summary of each CSV file is printed.

    Returns None on success, or an error message / exception that the caller
    hands to SystemExit (printed to stderr, exit status 1).
    """
    # --info changes which other options exist, so it is parsed up front.
    early_argparser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
    early_argparser.add_argument(
        "--all-flows",
        action="store_true",
        help="select all flows",
    )
    early_argparser.add_argument(
        "--flow",
        default=0,
        type=int,
        help="select a single flow by zero-based index",
    )
    early_argparser.add_argument(
        "--info",
        action="store_true",
        help="show information about the CSV file and exit",
    )
    early_args, remaining_argv = early_argparser.parse_known_args()

    argparser = argparse.ArgumentParser(
        add_help=True, allow_abbrev=False, parents=[early_argparser]
    )
    argparser.add_argument("csv_file", type=argparse.FileType("r"), nargs="+")

    if not early_args.info:
        argparser.add_argument("--fmt", default={}, type=DictStrArg(str))
        argparser.add_argument("--max", default={}, type=DictStrArg(float))
        argparser.add_argument("--min", default={}, type=DictStrArg(float))
        argparser.add_argument(
            "--smooth",
            const=99,
            default=0,
            nargs="?",
            type=OddIntArg,
            help="smooth the plotted data; an optionally given uneven integer number (greater than 2) specifies the filter window size",
        )
        argparser.add_argument("--title")
        argparser.add_argument("--unit", default={}, type=DictStrArg(str))
        argparser.add_argument("-x", choices=COLUMNS, default="time")
        # Plotting needs at least one y column; without "required" a missing
        # -y would only surface later as a TypeError on None.
        argparser.add_argument("-y", choices=COLUMNS, nargs="+", required=True)

    args = argparser.parse_args(remaining_argv, early_args)

    if args.info:
        for csv_file in args.csv_file:
            df = load_df(csv_file)
            pd.options.display.float_format = "{:06.4f}".format
            df.info()
            print(f"\n{df.describe(percentiles=[])}")
        return

    if args.all_flows and len(args.y) > 1:
        return "can only plot one data column when plotting multiple flows"

    _fig, axs = plt.subplots(
        len(args.csv_file), 1, constrained_layout=True, squeeze=False
    )

    for ax, csv_file in zip(axs.flat, args.csv_file):
        df = load_df(csv_file)

        # Flows are told apart by the "hash" column, or by "thread" as the
        # fallback.
        if "hash" in df:
            hash_col = "hash"
        elif "thread" in df:
            hash_col = "thread"
        else:
            return "missing a 'hash' or 'thread' column"

        available_hashes = df[hash_col].sort_values().unique()

        if not args.all_flows:
            try:
                selected_hash = available_hashes[args.flow]
            except IndexError:
                return (
                    f"flow index {args.flow} out of range, "
                    f"file has {len(available_hashes)} flow(s)"
                )
            # Copy, since columns of the selection are modified below.
            df = df[df[hash_col] == selected_hash].copy()

        xunit = args.unit.get(args.x)
        try:
            xunit = check_unit(args.x, xunit)
        except LookupError as e:
            return e

        df[args.x] *= UNIT_FACTORS.get(xunit, 1)
        xmin = args.min.get(args.x, df[args.x].min())
        xmax = args.max.get(args.x, df[args.x].max())
        df = df[(df[args.x] >= xmin) & (df[args.x] <= xmax)].copy()

        ax.set_xlabel(LABELS.get(args.x, args.x))
        setup_axis(ax.xaxis, args.x, xunit)
        axx = None
        lines = []

        if args.all_flows:
            # One y column, one line per flow, all on the same axes.
            y = args.y[0]

            max_yval = None
            min_yval = None
            yfmt = args.fmt.get(y) or FORMATS.get(y, "-")
            ymax = args.max.get(y)
            ymin = args.min.get(y)
            yunit = args.unit.get(y)
            try:
                yunit = check_unit(y, yunit)
            except LookupError as e:
                return e
            df[y] *= UNIT_FACTORS.get(yunit, 1)

            grouped_df = df.groupby(hash_col)

            for i, flow_hash in enumerate(available_hashes):
                # The x range filter above may have removed a flow entirely.
                if flow_hash not in grouped_df.groups:
                    continue
                flow_df = grouped_df.get_group(flow_hash)
                y_vals = (
                    savgol_filter(flow_df[y], args.smooth, 2)
                    if args.smooth > 0
                    else flow_df[y]
                )

                lines.extend(
                    ax.plot(
                        flow_df[args.x],
                        y_vals,
                        yfmt,
                        label=f"Flow {i}",
                    )
                )

                # Track the overall y range over all flows; the first flow
                # replaces the initial None via the TypeError path.
                flow_max_yval = y_vals.max()
                flow_min_yval = y_vals.min()
                try:
                    max_yval = max(max_yval, flow_max_yval)
                except TypeError:
                    max_yval = flow_max_yval
                try:
                    min_yval = min(min_yval, flow_min_yval)
                except TypeError:
                    min_yval = flow_min_yval

            ax.set_ylabel(LABELS.get(y, y))
            if lines:
                set_ylim(ax, min_yval, max_yval, ymin, ymax)
            setup_axis(ax.yaxis, y, yunit)
        else:
            # One flow, possibly several y columns, each on its own y axis.
            for i, y in enumerate(args.y):
                if args.x == y:
                    return f"cannot use the same column “{args.x}” for x and y axis"

                ycolor = COLORS[i % len(COLORS)]
                yfmt = args.fmt.get(y) or FORMATS.get(y, "-")
                ymax = args.max.get(y)
                ymin = args.min.get(y)
                yunit = args.unit.get(y)
                try:
                    yunit = check_unit(y, yunit)
                except LookupError as e:
                    return e
                df[y] *= UNIT_FACTORS.get(yunit, 1)

                # The first column uses the subplot's own y axis, every
                # further one (except "ev") a twin axis shifted outward.
                if axx is not None and y not in ("ev",):
                    axx = ax.twinx()
                    axx.spines["right"].set_position(("outward", (len(lines) - 1) * 50))
                else:
                    axx = ax

                if y == "ev":
                    # Events are drawn as full-height vertical lines.
                    lines.append(
                        ax.vlines(
                            df.loc[df[y].notna(), args.x],
                            0,
                            1,
                            color="lightgrey",
                            label=LABELS.get(y),
                            transform=ax.get_xaxis_transform(),
                        )
                    )
                else:
                    y_vals = (
                        savgol_filter(df[y], args.smooth, 2)
                        if args.smooth > 0
                        else df[y]
                    )
                    lines.extend(
                        axx.plot(
                            df[args.x],
                            y_vals,
                            yfmt,
                            color=ycolor,
                            label=LABELS.get(y, y),
                        )
                    )
                    axx.set_ylabel(LABELS.get(y, y))
                    set_ylim(axx, y_vals.min(), y_vals.max(), ymin, ymax)
                    setup_axis(axx.yaxis, y, yunit)

        if len(lines) > 1:
            ax.legend(handles=lines)

        if args.title:
            ax.set_title(args.title)
        elif len(args.csv_file) > 1:
            ax.set_title(Path(csv_file.name).name)

    plt.show()
385 |
386 |
# SystemExit turns a None result of main() into exit status 0; any other
# returned object is printed to stderr and yields exit status 1.
if __name__ == "__main__":
    raise SystemExit(main())
389 |
--------------------------------------------------------------------------------
/tools/powertcp_experiment:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -eu
4 |
5 | readonly base_srv_port=50000
6 |
# Run one experiment for the congestion control algorithm in $cca: start one
# iperf client per destination in parallel while `tcp_int trace` is being
# captured, then append the results to the two CSV files.
#
# Globals (read):    tcp_int_tool, tmpdir, destinations, iperf_interval,
#                    base_srv_port, cca, run_duration, tcp_int_csvfile,
#                    iperf_csvfile
# Globals (written): i (loop counter)
# Outputs:           appends to $tcp_int_csvfile and $iperf_csvfile, creating
#                    them with a header line first if they do not exist yet;
#                    empties $tmpdir afterwards
clients_run()
{
    "$tcp_int_tool" trace >"${tmpdir}/tcp_int_trace" &
    local trace_pid=$!

    local iperf_pids=()
    for ((i = 0; i < ${#destinations[@]}; ++i)); do
        iperf --client "${destinations[$i]}" \
            --interval="$iperf_interval" \
            --nodelay \
            --port=$((base_srv_port + 1 + i)) \
            --reportstyle=C \
            --tcp-congestion="$cca" \
            --time="$run_duration" \
            >"${tmpdir}/$i.iperf" &
        iperf_pids+=($!)
    done

    # Wait for the iperf clients only; the trace keeps running.
    wait "${iperf_pids[@]}"

    progress ' capturing traces …'

    sleep 5 # Wait a bit to catch all trace entries.
    kill "$trace_pid"
    wait

    # The last line of the captured `tcp_int trace` is often broken.
    sed -i '$d' "${tmpdir}/tcp_int_trace"

    # The last line of the captured iperf outputs is a summary. We don't need it.
    sed -i -s '$d' "${tmpdir}/"*.iperf

    # Line 2 of the trace is used as the CSV header (prefixed with the name
    # of the added first column), lines 4 and following as the data rows
    # (prefixed with the CCA).
    if [[ ! -f $tcp_int_csvfile ]]; then
        prepend_lines '2,2' 'cca,' <"${tmpdir}/tcp_int_trace" >"$tcp_int_csvfile"
    fi
    prepend_lines '4,$' "${cca}," <"${tmpdir}/tcp_int_trace" >>"$tcp_int_csvfile"

    if [[ ! -f $iperf_csvfile ]]; then
        printf 'cca,datetime,srcip,srcport,dstip,dstport,thread,interval,cwnd,rate\n' >"$iperf_csvfile"
    fi
    cat "${tmpdir}/"*.iperf | prepend_lines '' "${cca}," >>"$iperf_csvfile"

    rm -f -- "${tmpdir}/"*
}
51 |
# Print a printf-style message to stderr and terminate the script.
#
# Arguments: $1    exit status
#            $2... printf format string followed by its arguments
die()
{
    local -r exit_status=$1
    # shellcheck disable=SC2059
    printf "${@:2}" >&2
    exit "$exit_status"
}
60 |
# Copy selected lines from stdin to stdout, each prefixed with a string; all
# other lines are dropped.
#
# Arguments: $1 sed address selecting the lines (e.g. '2,2' or '4,$'); an
#               empty address selects every line
#            $2 literal prefix to put in front of each selected line
prepend_lines()
{
    local addr=$1
    local prefix=$2
    local script

    # The prefix ends up in the replacement part of sed's s command, where
    # "\", "/" and "&" are special. Escape them so it is taken literally.
    prefix=$(printf '%s' "$prefix" | sed -e 's|[\\/&]|\\&|g')

    # <addr>s/^/<prefix>/  prefix the selected lines,
    # <addr>p              print them,
    # d                    and never auto-print anything.
    printf -v script '%ss/^/%s/;%sp;d' "$addr" "$prefix" "$addr"
    sed -e "$script"
}
69 |
# Print a printf-style progress message to stderr, keeping stdout clean.
#
# Arguments: printf format string followed by its arguments
progress()
{
    # shellcheck disable=SC2059
    >&2 printf "$@"
}
75 |
# Start one background iperf server per destination, block until a
# connection to the control port has come and gone, then stop the servers.
#
# Globals (read):    destinations, base_srv_port
# Globals (written): i (loop counter)
servers_run()
{
    local server_pids=()

    i=0
    while ((i < ${#destinations[@]})); do
        iperf --port=$((base_srv_port + 1 + i)) --server &
        server_pids+=("$!")
        ((++i))
    done

    # netcat returns once the listening connection has been closed.
    netcat -q0 -l "$base_srv_port"
    kill "${server_pids[@]}"
    wait
}
88 |
# Usage:
#   powertcp_experiment client [-C CCA]... [-i INTERVAL] [-P 'PARAMS'] [-t SECONDS] DESTINATION...
#   powertcp_experiment server
#
# The server must be started first. The client sends the CCAs and
# destinations to the server (the first destination) over the control port,
# runs one experiment per CCA and finally tells the server to stop.
readonly role=$1
shift

[[ $role == client || $role == server ]] || die 2 'unknown role -- %s\n' "$role"

if [[ $role == client ]]; then
    ccas=(bpf_powertcp)
    iperf_interval=1
    powertcp_params=()
    run_duration=10

    while getopts 'C:i:P:t:' opt; do
        case $opt in
            C) ccas+=("$OPTARG") ;;
            i) iperf_interval=$OPTARG ;;
            P)
                # Deliberately split into words: one option carries all
                # parameters for `powertcp register`.
                # shellcheck disable=SC2206
                powertcp_params=($OPTARG)
                ;;
            t) run_duration=$OPTARG ;;
            ?) exit 2 ;;
        esac
    done

    # Compare against cubic if no other CCA was asked for.
    [[ ${#ccas[@]} -gt 1 ]] || ccas+=(cubic)

    readonly ccas
    readonly iperf_interval
    readonly powertcp_params
    readonly run_duration

    shift $((OPTIND - 1))
else
    # The server takes no options; reject any that are given.
    while getopts '' opt; do
        case $opt in
            ?) exit 2 ;;
        esac
    done
fi

if [[ $role == client ]]; then
    [[ $# -gt 0 ]] || die 2 'missing destination(s)\n'
    readonly destinations=("$@")
elif [[ $# -gt 0 ]]; then
    die 2 'unexpected arguments -- %s\n' "$*"
fi

if [[ $role == client ]]; then
    progress 'Sending experiment parameters to server …\n'
    # The parameters travel as shell code which the server evals. %q quotes
    # every element on its own, so both arrays arrive with the same elements
    # they have here.
    netcat -q0 "${destinations[0]}" "$base_srv_port" <<-EOF || die 1 "Cannot reach server, experiment must be started there first!\n"
readonly ccas=($(printf '%q ' "${ccas[@]}"))
readonly destinations=($(printf '%q ' "${destinations[@]}"))
EOF
else
    progress 'Waiting for client to send experiment parameters …\n'
    # Executing arbitrary remote shell code, what could go wrong O:-)
    eval "$(netcat -q0 -l "$base_srv_port")"
fi

# The tools are addressed relative to this script's location.
readonly repo_dir=${0%/*}/..
if [[ ! -d ${repo_dir}/tools ]]; then
    die 2 "I don’t know where I’m called from\n"
fi

tmpdir=$(mktemp --directory) || die 1 'failed to create a tempdir\n'
readonly tmpdir
# Remove the tempdir on every exit path. A trapped signal does not end the
# script by itself, so the signal traps exit explicitly, which in turn runs
# the EXIT trap.
# shellcheck disable=SC2064
trap "rm -rf -- '$tmpdir'" EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

readonly powertcp_tool=${repo_dir}/bpf/powertcp
readonly tcp_int_tool=${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int

# The CSV file names start with all CCA names, each followed by "-".
printf -v csvfile_prefix '%s-' "${ccas[@]}"
readonly iperf_csvfile=${csvfile_prefix}iperf.csv
readonly tcp_int_csvfile=${csvfile_prefix}tcp_int.csv
rm -f "$iperf_csvfile" "$tcp_int_csvfile"

# Start from a clean TCP-INT state; a failing unload is ignored on purpose.
"$tcp_int_tool" unload &>/dev/null || :
"$tcp_int_tool" load
"$tcp_int_tool" enable
# Add this shell to the cgroup.tcp-int cgroup.
printf "%d" "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

# powertcp_params is only set on the client; the "+" expansion keeps an unset
# or empty array from tripping `set -u`.
"$powertcp_tool" register -f tracing ${powertcp_params[@]+"${powertcp_params[@]}"}

if [[ $role == client ]]; then
    for cca in "${ccas[@]}"; do
        progress '%s …' "$cca"
        clients_run
        progress ' done.\n'
    done

    # Connecting to the control port lets the server's netcat return, which
    # ends servers_run there.
    netcat -w1 -q0 "${destinations[0]}" "$base_srv_port"
else
    servers_run
fi
183 |
--------------------------------------------------------------------------------
/tools/powertcp_plot:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 |
5 | import matplotlib as mpl
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import pandas as pd
9 | from matplotlib import legend_handler, ticker
10 |
# Figure size (width, height) handed to plt.subplots(figsize=...).
FIGSIZE = (2.5, 2)

# Applied to the column names (rename) and to the cell values (replace) of
# both CSV files.
REPLACEMENTS = {
    "DIP:DPORT": "Flow",
    "QDEPTH(B)": "Queue depth",
    "TIME(s)": "time",
    "bpf_powertcp": "PowerTCP",
    "cca": "CCA",
    "cubic": "Cubic",
    "dctcp": "DCTCP",
    "dstport": "Flow",
}

# One line style per CCA.
LINESTYLES = ["--", "-"]
26 |
def main():
    """Plot throughput (iperf CSV) and queue depth (tcp_int CSV) per CCA.

    Both CSV files carry a "cca" column. Two figures are saved as
    "<ccas>-throughput.pdf" and "<ccas>-qdepth.pdf", where <ccas> are the CCA
    names from the iperf CSV joined with "-", and then shown. --since/--until
    select the plotted time range in seconds; --mean-window is the number of
    consecutive queue depth samples averaged into one plotted point.
    """
    plt.rcParams.update({"pdf.fonttype": 42})

    try:
        mpl.style.use("seaborn-v0_8-colorblind")
    except Exception as e:
        print("Failed to change matplotlib style: {}".format(e))

    argparser = argparse.ArgumentParser()
    argparser.add_argument("--mean-window", default=1000, type=int)
    argparser.add_argument("--since", default=0, type=float)
    argparser.add_argument("--until", default=None, type=float)
    argparser.add_argument("iperf_csv_file", type=argparse.FileType("r"))
    argparser.add_argument("tcp_int_csv_file", type=argparse.FileType("r"))
    args = argparser.parse_args()

    iperf_df = pd.read_csv(args.iperf_csv_file)
    args.iperf_csv_file.close()  # Be nice and close files :-)

    # Taken before the replacements below, so the file names use the raw
    # CCA names.
    ccas = iperf_df["cca"].unique()
    filename_prefix = "-".join(ccas)

    iperf_df = iperf_df.rename(columns=REPLACEMENTS).replace(REPLACEMENTS)
    # Make the time of every CCA's run start at 0.
    iperf_df["datetime"] = iperf_df["datetime"] - iperf_df.groupby("CCA")[
        "datetime"
    ].transform("min")
    # Turn the destination port into a small flow number.
    iperf_df["Flow"] -= 50000
    iperf_df["rate"] /= 10**9  # bit/s -> Gbit/s

    # Without --until, stop where the shortest run ends.
    iperf_until = (
        args.until
        if args.until is not None
        else min(iperf_df.groupby("CCA")["datetime"].max())
    )
    # Copy the selection: it is modified in place right below, which pandas
    # would otherwise flag as an assignment on a slice of another frame.
    iperf_df = iperf_df[
        (iperf_df["datetime"] >= args.since) & (iperf_df["datetime"] <= iperf_until)
    ].copy()
    iperf_df["datetime"] -= args.since

    fig1, ax1 = plt.subplots(figsize=FIGSIZE, layout="constrained")
    # CCA name -> tuple of the lines of all its flows, for a combined legend
    # entry per CCA.
    cca_rate_lines = {
        cca_group[0]: tuple(
            ax1.plot("datetime", "rate", data=flow_df, linestyle=linestyle)[0]
            for _flow, flow_df in cca_group[1].groupby("Flow")
        )
        for linestyle, cca_group in zip(LINESTYLES, iperf_df.groupby("CCA"))
    }

    ax1.grid(linestyle="--")
    ax1.legend(
        bbox_to_anchor=(-0.15, 1.1, 1.15, 0),
        borderaxespad=0,
        handler_map={tuple: legend_handler.HandlerTuple(None, pad=0)},
        handles=cca_rate_lines.values(),
        labels=cca_rate_lines.keys(),
        loc="lower left",
        mode="expand",
        ncols=len(ccas),
    )
    ax1.set_xlabel("Time (s)")
    ax1.set_ylabel("Throughput (Gbps)")
    ax1.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100]))

    fig1.savefig(f"{filename_prefix}-throughput.pdf")

    tcp_int_df = pd.read_csv(args.tcp_int_csv_file)
    args.tcp_int_csv_file.close()

    # The column names may be padded with whitespace.
    tcp_int_df.columns = tcp_int_df.columns.str.strip()
    tcp_int_df = tcp_int_df.rename(columns=REPLACEMENTS).replace(REPLACEMENTS)

    tcp_int_until = (
        args.until
        if args.until is not None
        else min(tcp_int_df.groupby("CCA")["time"].max())
    )
    # Copy for the same reason as for the iperf data above.
    tcp_int_df = tcp_int_df[
        (tcp_int_df["time"] >= args.since) & (tcp_int_df["time"] <= tcp_int_until)
    ].copy()
    tcp_int_df["Queue depth"] /= 1000  # bytes -> KB
    tcp_int_df["time"] -= args.since

    fig2, ax2 = plt.subplots(figsize=FIGSIZE, layout="constrained")

    for i, cca_group in enumerate(tcp_int_df.groupby("CCA")):
        cca, cca_df = cca_group

        # Re-index the samples 0..n-1, then average every --mean-window
        # consecutive samples (queue depth and time alike).
        qdepth_per_cca_flow = pd.pivot_table(
            cca_df, index=np.arange(len(cca_df)), values=["Queue depth", "time"]
        )
        qdepth_per_cca_flow = qdepth_per_cca_flow.groupby(
            np.arange(len(qdepth_per_cca_flow)) // args.mean_window
        ).mean()

        ax2.plot(
            "time",
            "Queue depth",
            data=qdepth_per_cca_flow,
            linestyle=LINESTYLES[i % len(LINESTYLES)],
            label=cca,
        )

    ax2.grid(linestyle="--")
    ax2.legend(
        bbox_to_anchor=(-0.25, 1.1, 1.25, 0),
        borderaxespad=0,
        loc="lower left",
        mode="expand",
        ncols=len(ccas),
    )
    ax2.set_xlabel("Time (s)")
    ax2.set_ylabel("Queue depth (KB)")
    ax2.yaxis.set_major_locator(ticker.MaxNLocator(nbins="auto", steps=[5, 10]))

    fig2.savefig(f"{filename_prefix}-qdepth.pdf")

    plt.show()
144 |
145 |
# main() has no return statement, so SystemExit(None) gives exit status 0
# unless an exception propagates.
if __name__ == "__main__":
    raise SystemExit(main())
148 |
--------------------------------------------------------------------------------
/tools/reinsmod:
--------------------------------------------------------------------------------
#!/bin/sh
#
# Remove a module (if loaded) and add it to the kernel (again).
#
# Usage: reinsmod MODULE_PATH [MODULE_PARAMETER...]
#
# MODULE_PATH is the path of the module file (*.ko) handed to insmod; the
# module name used for the "is it loaded" check and for rmmod is derived
# from its file name.

set -eu

if [ $# -lt 1 ]; then
    printf 'Usage: %s MODULE_PATH [MODULE_PARAMETER...]\n' "${0##*/}" >&2
    exit 2
fi

modpath=$1
shift

modname=${modpath##*/}
modname=${modname%.ko}
# lsmod lists module names with underscores, even if the file name of the
# module contains dashes.
modname=$(printf '%s' "$modname" | tr '-' '_')

# Compare against the whole first column of lsmod: a plain grep would also
# match modules that merely contain the name, or that list it as a user.
if lsmod | awk -v name="$modname" '$1 == name { found = 1 } END { exit !found }'; then
    rmmod "$modname"
fi
insmod "$modpath" "$@"
17 |
--------------------------------------------------------------------------------
/tools/screen/iperf-client.screen:
--------------------------------------------------------------------------------
# Open a window with the default shell …
screen
# … and run `tcp_int ecr-disable`. The path is relative, so it resolves
# against the directory the screen session was started in.
exec ./bpf/tcp-int/code/src/tools/tcp_int ecr-disable
3 |
--------------------------------------------------------------------------------
/tools/screen/iperf-servers.screen:
--------------------------------------------------------------------------------
# One window per server: iperf and iperf3, each started with -s.
screen iperf -s
screen iperf3 -s
3 |
--------------------------------------------------------------------------------
/tools/send_something:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -eu
3 |
# Print the one-line usage summary to stdout.
usage()
{
    printf '%s\n' "Usage: $0 HOST PORT DATA_SIZE [REPEATS [DELAY]]"
}
8 |
# HOST, PORT and DATA_SIZE are mandatory.
[ $# -ge 3 ] || {
    {
        printf "Missing arguments\n"
        usage
    } >&2
    exit 2
}

host=$1
port=$2
count=$3
repeats=${4:-1}
delay=${5:-0}

# Send DATA_SIZE random bytes REPEATS times over one single connection,
# pausing DELAY between two transmissions but not after the last one.
i=1
while [ "$i" -le "$repeats" ]; do
    dd count="$count" if=/dev/urandom iflag=count_bytes
    [ "$i" = "$repeats" ] || sleep "$delay"
    i=$((i + 1))
done | netcat -q0 "$host" "$port"
25 |
--------------------------------------------------------------------------------
/tools/setup-bpf:
--------------------------------------------------------------------------------
#!/bin/sh
#
# Setup the BPF implementation and TCP-INT and start a screen session to use it
# (for e.g. iperf3 usage or with already running iperf(3) servers).
#
# Usage: setup-bpf SESSION [POWERTCP_REGISTER_ARGUMENT...]
#
# SESSION names the screen configuration tools/screen/SESSION.screen; all
# further arguments are passed on to `powertcp register`.
#

set -eu

if [ "$(id -u)" -ne 0 ]; then
    echo "setup-bpf: you probably want to execute this as root" >&2
    exit 2
fi

if [ $# -lt 1 ]; then
    echo "Usage: setup-bpf SESSION [POWERTCP_REGISTER_ARGUMENT...]" >&2
    exit 2
fi

repo_dir=${0%/*}/..
session=$1
user=${SUDO_USER:-$USER}

shift

if [ ! -d "$repo_dir/tools" ]; then
    echo "must be called from within the repository" >&2
    exit 2
fi

# The tools below are addressed relative to the repository root, so change
# there instead of requiring the caller to.
cd "$repo_dir"

# Start from a clean state; failures (nothing registered/loaded yet) are
# ignored on purpose.
./bpf/powertcp unregister || :
./bpf/tcp-int/code/src/tools/tcp_int unload || :

./bpf/tcp-int/code/src/tools/tcp_int load
./bpf/tcp-int/code/src/tools/tcp_int enable
./bpf/powertcp register "$@"

# Add this shell to the cgroup.tcp-int cgroup.
echo $$ >> /sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

screen -S "$session.$user" -c "./tools/screen/$session.screen"
35 |
--------------------------------------------------------------------------------
/tools/setup-module:
--------------------------------------------------------------------------------
#!/bin/sh
#
# Setup the kernel module implementation (tcp_powertcp.ko) and start a screen
# session to use it (for e.g. iperf3 usage or with already running iperf(3)
# servers).
#
# Usage: setup-module SESSION [MODULE_PARAMETER...]
#
# SESSION names the screen configuration tools/screen/SESSION.screen; all
# further arguments are passed on to insmod.
#

set -eu

if [ "$(id -u)" -ne 0 ]; then
    echo "setup-module: you probably want to execute this as root" >&2
    exit 2
fi

repo_dir=${0%/*}/..
session=$1
user=${SUDO_USER:-$USER}

shift

if [ ! -d "$repo_dir/tools" ]; then
    echo "must be called from within the repository" >&2
    exit 2
fi

# Unload the module first if lsmod lists it, then load the module file found
# in the repository directory.
! lsmod | grep -q ^tcp_powertcp || rmmod tcp_powertcp
insmod "$repo_dir/tcp_powertcp.ko" "$@"

screen -S "$session.$user" -c "$repo_dir/tools/screen/$session.screen"
29 |
--------------------------------------------------------------------------------
/tools/tracing/to_csv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bpftrace
2 |
3 | #include
4 |
// Print the CSV header. Every row printed below has these ten columns.
BEGIN
{
    print("time,hash,ack_seq,base_rtt,beta,cwnd,rate,power_scale,p_norm_scaled,ev");
}

// One row per new_ack event. base_rtt, beta, power_scale and p_norm are not
// arguments of this tracepoint; the values last stored for the current
// thread by the norm_power and update_window probes are printed instead.
// The "ev" column stays empty.
tracepoint:powertcp:new_ack
{
    printf("%llu,%u,%u,%ld,%d,%u,%lu,%ld,%ld,\n", args->time, args->hash,
           args->ack_seq, @base_rtt[tid], @beta[tid], args->cwnd, args->rate,
           @power_scale[tid], @p_norm[tid]);
}

// Remember the base RTT per thread for the rows printed by new_ack.
tracepoint:powertcp:norm_power
{
    @base_rtt[tid] = args->base_rtt;
}

// One row per reset event; the "ack_seq" column stays empty. beta,
// power_scale and p_norm are only filled in (from the per-thread values)
// when the event is not CA_EVENT_CWND_RESTART and a positive beta has been
// stored; otherwise those columns stay empty as well.
tracepoint:powertcp:reset
{
    if (args->ev != CA_EVENT_CWND_RESTART && @beta[tid] > 0) {
        printf("%llu,%u,,%ld,%d,%u,%lu,%ld,%ld,%d\n", args->time, args->hash,
               args->base_rtt, @beta[tid], args->cwnd, args->rate,
               @power_scale[tid], @p_norm[tid], args->ev);
    } else {
        printf("%llu,%u,,%ld,,%u,%lu,,,%d\n", args->time, args->hash,
               args->base_rtt, args->cwnd, args->rate, args->ev);
    }
}

// Remember beta, the power scale and the normalized power per thread for
// the rows printed by new_ack and reset.
tracepoint:powertcp:update_window
{
    @beta[tid] = args->beta;
    @power_scale[tid] = args->power_scale;
    @p_norm[tid] = args->p_norm;
}
40 |
--------------------------------------------------------------------------------
/tools/tune-eth:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -u
3 |
# At least one interface name is required; every argument is treated as one.
if [ $# -lt 1 ]; then
    printf "missing interface name\n" >&2
    exit 2
fi
8 |
# Run a command with its arguments if the command exists; otherwise print a
# note to stderr naming the skipped call and carry on.
#
# Arguments: $1 command name, $2... its arguments
# Returns:   the command's exit status, or that of the printed note
call_if_found()
{
    if ! command -v "$1" >/dev/null; then
        printf '%s not found in PATH, skipping `%s`\n' "$1" "$*" >&2
        return
    fi
    "$@"
}
17 |
call_if_found tuned-adm profile network-latency

# The sysctl settings do not depend on an interface, so they are applied
# once instead of once per interface.
call_if_found sysctl -qw \
    vm.overcommit_memory=1 \
    net.core.busy_poll=50000 \
    net.core.busy_read=50000 \
    net.core.somaxconn=4096 \
    net.core.netdev_max_backlog=8192 \
    net.ipv4.tcp_max_syn_backlog=16384 \
    net.core.rmem_max=16777216 \
    net.core.wmem_max=16777216 \
    net.ipv4.tcp_mem="764688 1019584 16777216" \
    net.ipv4.tcp_rmem="8192 87380 16777216" \
    net.ipv4.tcp_wmem="8192 65536 16777216"

call_if_found sysctl -qw \
    net.ipv4.tcp_sack=0 \
    net.ipv4.tcp_timestamps=0

# Per interface: ring sizes (-G), offloads (-K) and interrupt coalescing
# (-C). Without `set -e`, a setting that fails does not stop the rest.
for iface in "$@"; do
    call_if_found ethtool -G "$iface" \
        tx 8160 \
        rx 8160

    call_if_found ethtool -K "$iface" \
        gro on \
        gso on \
        lro on \
        rx on \
        tso on \
        tx on

    call_if_found ethtool -C "$iface" \
        adaptive-rx off rx-usecs 0 \
        adaptive-tx off tx-usecs 10
done
54 |
--------------------------------------------------------------------------------