├── .clang-format ├── .gitignore ├── .gitmodules ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── bpf ├── .gitignore ├── Makefile ├── README.md ├── bpf_ca_helpers.h ├── powertcp.bpf.c ├── powertcp.cpp ├── powertcp_tcp-int.bpf.c └── powertcp_tcp-int_head.bpf.c ├── dkms.conf ├── doc ├── code-structure.md └── module.md ├── powertcp.c ├── powertcp_defs.h ├── powertcp_head.c ├── powertcp_int.c ├── powertcp_no-int.c ├── powertcp_no-int_head.c ├── powertcp_trace.h ├── tcp_powertcp.c ├── tcp_powertcp_trace.h └── tools ├── README.md ├── bpf_tracer ├── gro_experiment ├── gro_plot ├── iperf_csv ├── iratio_experiment ├── iratio_plot ├── mtu_experiment ├── mtu_plot ├── plot ├── powertcp_experiment ├── powertcp_plot ├── reinsmod ├── screen ├── iperf-client.screen └── iperf-servers.screen ├── send_something ├── setup-bpf ├── setup-module ├── tracing └── to_csv └── tune-eth /.clang-format: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: GPL-2.0 2 | # 3 | # clang-format configuration file. Intended for clang-format >= 4. 
4 | # 5 | # For more information, see: 6 | # 7 | # Documentation/process/clang-format.rst 8 | # https://clang.llvm.org/docs/ClangFormat.html 9 | # https://clang.llvm.org/docs/ClangFormatStyleOptions.html 10 | # 11 | --- 12 | AccessModifierOffset: -4 13 | AlignAfterOpenBracket: Align 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | #AlignEscapedNewlines: Left # Unknown to clang-format-4.0 17 | AlignOperands: true 18 | AlignTrailingComments: false 19 | AllowAllParametersOfDeclarationOnNextLine: false 20 | AllowShortBlocksOnASingleLine: false 21 | AllowShortCaseLabelsOnASingleLine: false 22 | AllowShortFunctionsOnASingleLine: None 23 | AllowShortIfStatementsOnASingleLine: false 24 | AllowShortLoopsOnASingleLine: false 25 | AlwaysBreakAfterDefinitionReturnType: None 26 | AlwaysBreakAfterReturnType: None 27 | AlwaysBreakBeforeMultilineStrings: false 28 | AlwaysBreakTemplateDeclarations: false 29 | BinPackArguments: true 30 | BinPackParameters: true 31 | BraceWrapping: 32 | AfterClass: false 33 | AfterControlStatement: false 34 | AfterEnum: false 35 | AfterFunction: true 36 | AfterNamespace: true 37 | AfterObjCDeclaration: false 38 | AfterStruct: false 39 | AfterUnion: false 40 | #AfterExternBlock: false # Unknown to clang-format-5.0 41 | BeforeCatch: false 42 | BeforeElse: false 43 | IndentBraces: false 44 | #SplitEmptyFunction: true # Unknown to clang-format-4.0 45 | #SplitEmptyRecord: true # Unknown to clang-format-4.0 46 | #SplitEmptyNamespace: true # Unknown to clang-format-4.0 47 | BreakBeforeBinaryOperators: None 48 | BreakBeforeBraces: Custom 49 | #BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 50 | BreakBeforeTernaryOperators: false 51 | BreakConstructorInitializersBeforeComma: false 52 | #BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 53 | BreakAfterJavaFieldAnnotations: false 54 | BreakStringLiterals: false 55 | ColumnLimit: 80 56 | CommentPragmas: '^ IWYU pragma:' 57 | 
#CompactNamespaces: false # Unknown to clang-format-4.0 58 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 59 | ConstructorInitializerIndentWidth: 8 60 | ContinuationIndentWidth: 8 61 | Cpp11BracedListStyle: false 62 | DerivePointerAlignment: false 63 | DisableFormat: false 64 | ExperimentalAutoDetectBinPacking: false 65 | #FixNamespaceComments: false # Unknown to clang-format-4.0 66 | 67 | # Taken from: 68 | # git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ 69 | # | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ 70 | # | sort | uniq 71 | ForEachMacros: 72 | - 'apei_estatus_for_each_section' 73 | - 'ata_for_each_dev' 74 | - 'ata_for_each_link' 75 | - '__ata_qc_for_each' 76 | - 'ata_qc_for_each' 77 | - 'ata_qc_for_each_raw' 78 | - 'ata_qc_for_each_with_internal' 79 | - 'ax25_for_each' 80 | - 'ax25_uid_for_each' 81 | - '__bio_for_each_bvec' 82 | - 'bio_for_each_bvec' 83 | - 'bio_for_each_bvec_all' 84 | - 'bio_for_each_integrity_vec' 85 | - '__bio_for_each_segment' 86 | - 'bio_for_each_segment' 87 | - 'bio_for_each_segment_all' 88 | - 'bio_list_for_each' 89 | - 'bip_for_each_vec' 90 | - 'bitmap_for_each_clear_region' 91 | - 'bitmap_for_each_set_region' 92 | - 'blkg_for_each_descendant_post' 93 | - 'blkg_for_each_descendant_pre' 94 | - 'blk_queue_for_each_rl' 95 | - 'bond_for_each_slave' 96 | - 'bond_for_each_slave_rcu' 97 | - 'bpf_for_each_spilled_reg' 98 | - 'btree_for_each_safe128' 99 | - 'btree_for_each_safe32' 100 | - 'btree_for_each_safe64' 101 | - 'btree_for_each_safel' 102 | - 'card_for_each_dev' 103 | - 'cgroup_taskset_for_each' 104 | - 'cgroup_taskset_for_each_leader' 105 | - 'cpufreq_for_each_entry' 106 | - 'cpufreq_for_each_entry_idx' 107 | - 'cpufreq_for_each_valid_entry' 108 | - 'cpufreq_for_each_valid_entry_idx' 109 | - 'css_for_each_child' 110 | - 'css_for_each_descendant_post' 111 | - 'css_for_each_descendant_pre' 112 | - 'device_for_each_child_node' 113 | - 'displayid_iter_for_each' 114 | - 
'dma_fence_chain_for_each' 115 | - 'do_for_each_ftrace_op' 116 | - 'drm_atomic_crtc_for_each_plane' 117 | - 'drm_atomic_crtc_state_for_each_plane' 118 | - 'drm_atomic_crtc_state_for_each_plane_state' 119 | - 'drm_atomic_for_each_plane_damage' 120 | - 'drm_client_for_each_connector_iter' 121 | - 'drm_client_for_each_modeset' 122 | - 'drm_connector_for_each_possible_encoder' 123 | - 'drm_for_each_bridge_in_chain' 124 | - 'drm_for_each_connector_iter' 125 | - 'drm_for_each_crtc' 126 | - 'drm_for_each_crtc_reverse' 127 | - 'drm_for_each_encoder' 128 | - 'drm_for_each_encoder_mask' 129 | - 'drm_for_each_fb' 130 | - 'drm_for_each_legacy_plane' 131 | - 'drm_for_each_plane' 132 | - 'drm_for_each_plane_mask' 133 | - 'drm_for_each_privobj' 134 | - 'drm_mm_for_each_hole' 135 | - 'drm_mm_for_each_node' 136 | - 'drm_mm_for_each_node_in_range' 137 | - 'drm_mm_for_each_node_safe' 138 | - 'flow_action_for_each' 139 | - 'for_each_acpi_dev_match' 140 | - 'for_each_active_dev_scope' 141 | - 'for_each_active_drhd_unit' 142 | - 'for_each_active_iommu' 143 | - 'for_each_aggr_pgid' 144 | - 'for_each_available_child_of_node' 145 | - 'for_each_bio' 146 | - 'for_each_board_func_rsrc' 147 | - 'for_each_bvec' 148 | - 'for_each_card_auxs' 149 | - 'for_each_card_auxs_safe' 150 | - 'for_each_card_components' 151 | - 'for_each_card_dapms' 152 | - 'for_each_card_pre_auxs' 153 | - 'for_each_card_prelinks' 154 | - 'for_each_card_rtds' 155 | - 'for_each_card_rtds_safe' 156 | - 'for_each_card_widgets' 157 | - 'for_each_card_widgets_safe' 158 | - 'for_each_cgroup_storage_type' 159 | - 'for_each_child_of_node' 160 | - 'for_each_clear_bit' 161 | - 'for_each_clear_bit_from' 162 | - 'for_each_cmsghdr' 163 | - 'for_each_compatible_node' 164 | - 'for_each_component_dais' 165 | - 'for_each_component_dais_safe' 166 | - 'for_each_comp_order' 167 | - 'for_each_console' 168 | - 'for_each_cpu' 169 | - 'for_each_cpu_and' 170 | - 'for_each_cpu_not' 171 | - 'for_each_cpu_wrap' 172 | - 'for_each_dapm_widgets' 173 | - 
'for_each_dev_addr' 174 | - 'for_each_dev_scope' 175 | - 'for_each_dma_cap_mask' 176 | - 'for_each_dpcm_be' 177 | - 'for_each_dpcm_be_rollback' 178 | - 'for_each_dpcm_be_safe' 179 | - 'for_each_dpcm_fe' 180 | - 'for_each_drhd_unit' 181 | - 'for_each_dss_dev' 182 | - 'for_each_dtpm_table' 183 | - 'for_each_efi_memory_desc' 184 | - 'for_each_efi_memory_desc_in_map' 185 | - 'for_each_element' 186 | - 'for_each_element_extid' 187 | - 'for_each_element_id' 188 | - 'for_each_endpoint_of_node' 189 | - 'for_each_evictable_lru' 190 | - 'for_each_fib6_node_rt_rcu' 191 | - 'for_each_fib6_walker_rt' 192 | - 'for_each_free_mem_pfn_range_in_zone' 193 | - 'for_each_free_mem_pfn_range_in_zone_from' 194 | - 'for_each_free_mem_range' 195 | - 'for_each_free_mem_range_reverse' 196 | - 'for_each_func_rsrc' 197 | - 'for_each_hstate' 198 | - 'for_each_if' 199 | - 'for_each_iommu' 200 | - 'for_each_ip_tunnel_rcu' 201 | - 'for_each_irq_nr' 202 | - 'for_each_link_codecs' 203 | - 'for_each_link_cpus' 204 | - 'for_each_link_platforms' 205 | - 'for_each_lru' 206 | - 'for_each_matching_node' 207 | - 'for_each_matching_node_and_match' 208 | - 'for_each_member' 209 | - 'for_each_memcg_cache_index' 210 | - 'for_each_mem_pfn_range' 211 | - '__for_each_mem_range' 212 | - 'for_each_mem_range' 213 | - '__for_each_mem_range_rev' 214 | - 'for_each_mem_range_rev' 215 | - 'for_each_mem_region' 216 | - 'for_each_migratetype_order' 217 | - 'for_each_msi_entry' 218 | - 'for_each_msi_entry_safe' 219 | - 'for_each_net' 220 | - 'for_each_net_continue_reverse' 221 | - 'for_each_netdev' 222 | - 'for_each_netdev_continue' 223 | - 'for_each_netdev_continue_rcu' 224 | - 'for_each_netdev_continue_reverse' 225 | - 'for_each_netdev_feature' 226 | - 'for_each_netdev_in_bond_rcu' 227 | - 'for_each_netdev_rcu' 228 | - 'for_each_netdev_reverse' 229 | - 'for_each_netdev_safe' 230 | - 'for_each_net_rcu' 231 | - 'for_each_new_connector_in_state' 232 | - 'for_each_new_crtc_in_state' 233 | - 'for_each_new_mst_mgr_in_state' 234 
| - 'for_each_new_plane_in_state' 235 | - 'for_each_new_private_obj_in_state' 236 | - 'for_each_node' 237 | - 'for_each_node_by_name' 238 | - 'for_each_node_by_type' 239 | - 'for_each_node_mask' 240 | - 'for_each_node_state' 241 | - 'for_each_node_with_cpus' 242 | - 'for_each_node_with_property' 243 | - 'for_each_nonreserved_multicast_dest_pgid' 244 | - 'for_each_of_allnodes' 245 | - 'for_each_of_allnodes_from' 246 | - 'for_each_of_cpu_node' 247 | - 'for_each_of_pci_range' 248 | - 'for_each_old_connector_in_state' 249 | - 'for_each_old_crtc_in_state' 250 | - 'for_each_old_mst_mgr_in_state' 251 | - 'for_each_oldnew_connector_in_state' 252 | - 'for_each_oldnew_crtc_in_state' 253 | - 'for_each_oldnew_mst_mgr_in_state' 254 | - 'for_each_oldnew_plane_in_state' 255 | - 'for_each_oldnew_plane_in_state_reverse' 256 | - 'for_each_oldnew_private_obj_in_state' 257 | - 'for_each_old_plane_in_state' 258 | - 'for_each_old_private_obj_in_state' 259 | - 'for_each_online_cpu' 260 | - 'for_each_online_node' 261 | - 'for_each_online_pgdat' 262 | - 'for_each_pci_bridge' 263 | - 'for_each_pci_dev' 264 | - 'for_each_pci_msi_entry' 265 | - 'for_each_pcm_streams' 266 | - 'for_each_physmem_range' 267 | - 'for_each_populated_zone' 268 | - 'for_each_possible_cpu' 269 | - 'for_each_present_cpu' 270 | - 'for_each_prime_number' 271 | - 'for_each_prime_number_from' 272 | - 'for_each_process' 273 | - 'for_each_process_thread' 274 | - 'for_each_prop_codec_conf' 275 | - 'for_each_prop_dai_codec' 276 | - 'for_each_prop_dai_cpu' 277 | - 'for_each_prop_dlc_codecs' 278 | - 'for_each_prop_dlc_cpus' 279 | - 'for_each_prop_dlc_platforms' 280 | - 'for_each_property_of_node' 281 | - 'for_each_registered_fb' 282 | - 'for_each_requested_gpio' 283 | - 'for_each_requested_gpio_in_range' 284 | - 'for_each_reserved_mem_range' 285 | - 'for_each_reserved_mem_region' 286 | - 'for_each_rtd_codec_dais' 287 | - 'for_each_rtd_components' 288 | - 'for_each_rtd_cpu_dais' 289 | - 'for_each_rtd_dais' 290 | - 
'for_each_set_bit' 291 | - 'for_each_set_bit_from' 292 | - 'for_each_set_clump8' 293 | - 'for_each_sg' 294 | - 'for_each_sg_dma_page' 295 | - 'for_each_sg_page' 296 | - 'for_each_sgtable_dma_page' 297 | - 'for_each_sgtable_dma_sg' 298 | - 'for_each_sgtable_page' 299 | - 'for_each_sgtable_sg' 300 | - 'for_each_sibling_event' 301 | - 'for_each_subelement' 302 | - 'for_each_subelement_extid' 303 | - 'for_each_subelement_id' 304 | - '__for_each_thread' 305 | - 'for_each_thread' 306 | - 'for_each_unicast_dest_pgid' 307 | - 'for_each_vsi' 308 | - 'for_each_wakeup_source' 309 | - 'for_each_zone' 310 | - 'for_each_zone_zonelist' 311 | - 'for_each_zone_zonelist_nodemask' 312 | - 'fwnode_for_each_available_child_node' 313 | - 'fwnode_for_each_child_node' 314 | - 'fwnode_graph_for_each_endpoint' 315 | - 'gadget_for_each_ep' 316 | - 'genradix_for_each' 317 | - 'genradix_for_each_from' 318 | - 'hash_for_each' 319 | - 'hash_for_each_possible' 320 | - 'hash_for_each_possible_rcu' 321 | - 'hash_for_each_possible_rcu_notrace' 322 | - 'hash_for_each_possible_safe' 323 | - 'hash_for_each_rcu' 324 | - 'hash_for_each_safe' 325 | - 'hctx_for_each_ctx' 326 | - 'hlist_bl_for_each_entry' 327 | - 'hlist_bl_for_each_entry_rcu' 328 | - 'hlist_bl_for_each_entry_safe' 329 | - 'hlist_for_each' 330 | - 'hlist_for_each_entry' 331 | - 'hlist_for_each_entry_continue' 332 | - 'hlist_for_each_entry_continue_rcu' 333 | - 'hlist_for_each_entry_continue_rcu_bh' 334 | - 'hlist_for_each_entry_from' 335 | - 'hlist_for_each_entry_from_rcu' 336 | - 'hlist_for_each_entry_rcu' 337 | - 'hlist_for_each_entry_rcu_bh' 338 | - 'hlist_for_each_entry_rcu_notrace' 339 | - 'hlist_for_each_entry_safe' 340 | - 'hlist_for_each_entry_srcu' 341 | - '__hlist_for_each_rcu' 342 | - 'hlist_for_each_safe' 343 | - 'hlist_nulls_for_each_entry' 344 | - 'hlist_nulls_for_each_entry_from' 345 | - 'hlist_nulls_for_each_entry_rcu' 346 | - 'hlist_nulls_for_each_entry_safe' 347 | - 'i3c_bus_for_each_i2cdev' 348 | - 
'i3c_bus_for_each_i3cdev' 349 | - 'ide_host_for_each_port' 350 | - 'ide_port_for_each_dev' 351 | - 'ide_port_for_each_present_dev' 352 | - 'idr_for_each_entry' 353 | - 'idr_for_each_entry_continue' 354 | - 'idr_for_each_entry_continue_ul' 355 | - 'idr_for_each_entry_ul' 356 | - 'in_dev_for_each_ifa_rcu' 357 | - 'in_dev_for_each_ifa_rtnl' 358 | - 'inet_bind_bucket_for_each' 359 | - 'inet_lhash2_for_each_icsk_rcu' 360 | - 'key_for_each' 361 | - 'key_for_each_safe' 362 | - 'klp_for_each_func' 363 | - 'klp_for_each_func_safe' 364 | - 'klp_for_each_func_static' 365 | - 'klp_for_each_object' 366 | - 'klp_for_each_object_safe' 367 | - 'klp_for_each_object_static' 368 | - 'kunit_suite_for_each_test_case' 369 | - 'kvm_for_each_memslot' 370 | - 'kvm_for_each_vcpu' 371 | - 'list_for_each' 372 | - 'list_for_each_codec' 373 | - 'list_for_each_codec_safe' 374 | - 'list_for_each_continue' 375 | - 'list_for_each_entry' 376 | - 'list_for_each_entry_continue' 377 | - 'list_for_each_entry_continue_rcu' 378 | - 'list_for_each_entry_continue_reverse' 379 | - 'list_for_each_entry_from' 380 | - 'list_for_each_entry_from_rcu' 381 | - 'list_for_each_entry_from_reverse' 382 | - 'list_for_each_entry_lockless' 383 | - 'list_for_each_entry_rcu' 384 | - 'list_for_each_entry_reverse' 385 | - 'list_for_each_entry_safe' 386 | - 'list_for_each_entry_safe_continue' 387 | - 'list_for_each_entry_safe_from' 388 | - 'list_for_each_entry_safe_reverse' 389 | - 'list_for_each_entry_srcu' 390 | - 'list_for_each_prev' 391 | - 'list_for_each_prev_safe' 392 | - 'list_for_each_safe' 393 | - 'llist_for_each' 394 | - 'llist_for_each_entry' 395 | - 'llist_for_each_entry_safe' 396 | - 'llist_for_each_safe' 397 | - 'mci_for_each_dimm' 398 | - 'media_device_for_each_entity' 399 | - 'media_device_for_each_intf' 400 | - 'media_device_for_each_link' 401 | - 'media_device_for_each_pad' 402 | - 'nanddev_io_for_each_page' 403 | - 'netdev_for_each_lower_dev' 404 | - 'netdev_for_each_lower_private' 405 | - 
'netdev_for_each_lower_private_rcu' 406 | - 'netdev_for_each_mc_addr' 407 | - 'netdev_for_each_uc_addr' 408 | - 'netdev_for_each_upper_dev_rcu' 409 | - 'netdev_hw_addr_list_for_each' 410 | - 'nft_rule_for_each_expr' 411 | - 'nla_for_each_attr' 412 | - 'nla_for_each_nested' 413 | - 'nlmsg_for_each_attr' 414 | - 'nlmsg_for_each_msg' 415 | - 'nr_neigh_for_each' 416 | - 'nr_neigh_for_each_safe' 417 | - 'nr_node_for_each' 418 | - 'nr_node_for_each_safe' 419 | - 'of_for_each_phandle' 420 | - 'of_property_for_each_string' 421 | - 'of_property_for_each_u32' 422 | - 'pci_bus_for_each_resource' 423 | - 'pcl_for_each_chunk' 424 | - 'pcl_for_each_segment' 425 | - 'pcm_for_each_format' 426 | - 'ping_portaddr_for_each_entry' 427 | - 'plist_for_each' 428 | - 'plist_for_each_continue' 429 | - 'plist_for_each_entry' 430 | - 'plist_for_each_entry_continue' 431 | - 'plist_for_each_entry_safe' 432 | - 'plist_for_each_safe' 433 | - 'pnp_for_each_card' 434 | - 'pnp_for_each_dev' 435 | - 'protocol_for_each_card' 436 | - 'protocol_for_each_dev' 437 | - 'queue_for_each_hw_ctx' 438 | - 'radix_tree_for_each_slot' 439 | - 'radix_tree_for_each_tagged' 440 | - 'rb_for_each' 441 | - 'rbtree_postorder_for_each_entry_safe' 442 | - 'rdma_for_each_block' 443 | - 'rdma_for_each_port' 444 | - 'rdma_umem_for_each_dma_block' 445 | - 'resource_list_for_each_entry' 446 | - 'resource_list_for_each_entry_safe' 447 | - 'rhl_for_each_entry_rcu' 448 | - 'rhl_for_each_rcu' 449 | - 'rht_for_each' 450 | - 'rht_for_each_entry' 451 | - 'rht_for_each_entry_from' 452 | - 'rht_for_each_entry_rcu' 453 | - 'rht_for_each_entry_rcu_from' 454 | - 'rht_for_each_entry_safe' 455 | - 'rht_for_each_from' 456 | - 'rht_for_each_rcu' 457 | - 'rht_for_each_rcu_from' 458 | - '__rq_for_each_bio' 459 | - 'rq_for_each_bvec' 460 | - 'rq_for_each_segment' 461 | - 'scsi_for_each_prot_sg' 462 | - 'scsi_for_each_sg' 463 | - 'sctp_for_each_hentry' 464 | - 'sctp_skb_for_each' 465 | - 'shdma_for_each_chan' 466 | - '__shost_for_each_device' 467 
| - 'shost_for_each_device' 468 | - 'sk_for_each' 469 | - 'sk_for_each_bound' 470 | - 'sk_for_each_entry_offset_rcu' 471 | - 'sk_for_each_from' 472 | - 'sk_for_each_rcu' 473 | - 'sk_for_each_safe' 474 | - 'sk_nulls_for_each' 475 | - 'sk_nulls_for_each_from' 476 | - 'sk_nulls_for_each_rcu' 477 | - 'snd_array_for_each' 478 | - 'snd_pcm_group_for_each_entry' 479 | - 'snd_soc_dapm_widget_for_each_path' 480 | - 'snd_soc_dapm_widget_for_each_path_safe' 481 | - 'snd_soc_dapm_widget_for_each_sink_path' 482 | - 'snd_soc_dapm_widget_for_each_source_path' 483 | - 'tb_property_for_each' 484 | - 'tcf_exts_for_each_action' 485 | - 'udp_portaddr_for_each_entry' 486 | - 'udp_portaddr_for_each_entry_rcu' 487 | - 'usb_hub_for_each_child' 488 | - 'v4l2_device_for_each_subdev' 489 | - 'v4l2_m2m_for_each_dst_buf' 490 | - 'v4l2_m2m_for_each_dst_buf_safe' 491 | - 'v4l2_m2m_for_each_src_buf' 492 | - 'v4l2_m2m_for_each_src_buf_safe' 493 | - 'virtio_device_for_each_vq' 494 | - 'while_for_each_ftrace_op' 495 | - 'xa_for_each' 496 | - 'xa_for_each_marked' 497 | - 'xa_for_each_range' 498 | - 'xa_for_each_start' 499 | - 'xas_for_each' 500 | - 'xas_for_each_conflict' 501 | - 'xas_for_each_marked' 502 | - 'xbc_array_for_each_value' 503 | - 'xbc_for_each_key_value' 504 | - 'xbc_node_for_each_array_value' 505 | - 'xbc_node_for_each_child' 506 | - 'xbc_node_for_each_key_value' 507 | - 'zorro_for_each_dev' 508 | 509 | #IncludeBlocks: Preserve # Unknown to clang-format-5.0 510 | IncludeCategories: 511 | - Regex: '.*' 512 | Priority: 1 513 | IncludeIsMainRegex: '(Test)?$' 514 | IndentCaseLabels: false 515 | #IndentPPDirectives: None # Unknown to clang-format-5.0 516 | IndentWidth: 8 517 | IndentWrappedFunctionNames: false 518 | JavaScriptQuotes: Leave 519 | JavaScriptWrapImports: true 520 | KeepEmptyLinesAtTheStartOfBlocks: false 521 | MacroBlockBegin: '' 522 | MacroBlockEnd: '' 523 | MaxEmptyLinesToKeep: 1 524 | NamespaceIndentation: None 525 | #ObjCBinPackProtocolList: Auto # Unknown to 
clang-format-5.0 526 | ObjCBlockIndentWidth: 8 527 | ObjCSpaceAfterProperty: true 528 | ObjCSpaceBeforeProtocolList: true 529 | 530 | # Taken from git's rules 531 | #PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 532 | PenaltyBreakBeforeFirstCallParameter: 30 533 | PenaltyBreakComment: 10 534 | PenaltyBreakFirstLessLess: 0 535 | PenaltyBreakString: 10 536 | PenaltyExcessCharacter: 100 537 | PenaltyReturnTypeOnItsOwnLine: 60 538 | 539 | PointerAlignment: Right 540 | ReflowComments: false 541 | SortIncludes: false 542 | #SortUsingDeclarations: false # Unknown to clang-format-4.0 543 | SpaceAfterCStyleCast: false 544 | SpaceAfterTemplateKeyword: true 545 | SpaceBeforeAssignmentOperators: true 546 | #SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 547 | #SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 548 | SpaceBeforeParens: ControlStatements 549 | #SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 550 | SpaceInEmptyParentheses: false 551 | SpacesBeforeTrailingComments: 1 552 | SpacesInAngles: false 553 | SpacesInContainerLiterals: false 554 | SpacesInCStyleCastParentheses: false 555 | SpacesInParentheses: false 556 | SpacesInSquareBrackets: false 557 | Standard: Cpp03 558 | TabWidth: 8 559 | UseTab: Always 560 | ... 
561 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.cmd 2 | *.d 3 | *.ko 4 | *.mod* 5 | *.o 6 | Module.symvers 7 | modules.order 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bpf/tcp-int"] 2 | path = bpf/tcp-int 3 | url = https://github.com/jtdor/p4app-TCP-INT.git 4 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | authors: 3 | - family-names: "Hinz" 4 | given-names: "Jörn-Thorben" 5 | orcid: "https://orcid.org/0009-0005-6588-3873" 6 | title: "PowerTCP for Linux" 7 | url: "https://github.com/inet-tub/powertcp-linux" 8 | message: "If you use this software, please cite it as below." 
9 | preferred-citation: 10 | type: conference-paper 11 | authors: 12 | - family-names: "Hinz" 13 | given-names: "Jörn-Thorben" 14 | orcid: "https://orcid.org/0009-0005-6588-3873" 15 | - family-names: "Addanki" 16 | given-names: "Vamsi" 17 | orcid: "https://orcid.org/0000-0002-0577-0413" 18 | - family-names: "Györgyi" 19 | given-names: "Csaba" 20 | orcid: "https://orcid.org/0000-0002-8083-3277" 21 | - family-names: "Jepsen" 22 | given-names: "Theo" 23 | orcid: "https://orcid.org/0000-0002-5845-5089" 24 | - family-names: "Schmid" 25 | given-names: "Stefan" 26 | orcid: "https://orcid.org/0000-0002-7798-1711" 27 | doi: "10.1145/3609021.3609295" 28 | journal: "eBPF '23: Proceedings of the 1st Workshop on eBPF and Kernel Extensions" 29 | publisher: 30 | name: "Association for Computing Machinery" 31 | month: 9 32 | start: 1 33 | end: 7 34 | title: "TCP's Third Eye: Leveraging eBPF for Telemetry-Powered Congestion Control" 35 | year: 2023 36 | references: 37 | - authors: 38 | - family-names: "Addanki" 39 | given-names: "Vamsi" 40 | - family-names: "Michel" 41 | given-names: "Oliver" 42 | - family-names: "Schmid" 43 | given-names: "Stefan" 44 | title: "PowerTCP: Pushing the Performance Limits of Datacenter Networks" 45 | start: 51 46 | end: 70 47 | journal: "19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)" 48 | year: 2022 49 | month: 4 50 | type: conference-paper 51 | - authors: 52 | - family-names: "Jereczek" 53 | given-names: "Grzegorz" 54 | - family-names: "Jepsen" 55 | given-names: "Theo" 56 | - family-names: "Wass" 57 | given-names: "Simon" 58 | - family-names: "Pujari" 59 | given-names: "Bimmy" 60 | - family-names: "Zhen" 61 | given-names: "Jerry" 62 | - family-names: "Lee" 63 | given-names: "Jeongkeun" 64 | title: "TCP-INT: Lightweight Network Telemetry with TCP Transport" 65 | start: 58 66 | end: 60 67 | journal: "Proceedings of the SIGCOMM'22 Poster and Demo Sessions" 68 | year: 2022 69 | month: 10 70 | type: conference-paper 71 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Internet Network Architectures (INET) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifneq ($(KERNELRELEASE),) 2 | 3 | # Without explicitly specifying the source folder as an include dir, 4 | # define_trace.h fails to find our trace header. 
5 | ccflags-y := -I$(src) 6 | 7 | obj-m := tcp_powertcp.o 8 | 9 | else 10 | 11 | KDIR ?= /lib/modules/$(shell uname -r)/build 12 | 13 | .PHONY: modules modules_install clean help 14 | modules modules_install clean help: 15 | $(MAKE) -C $(KDIR) M=$$PWD $@ 16 | 17 | dkms_package_version := $(shell awk -F= '$$1 == "PACKAGE_VERSION" { gsub("\"", "", $$2); print $$2 }' dkms.conf) 18 | 19 | .PHONY: dkms_install 20 | dkms_install: 21 | dkms install . 22 | 23 | .PHONY: dkms_uninstall 24 | dkms_uninstall: 25 | dkms remove --all powertcp/$(dkms_package_version) 26 | $(RM) -r /usr/src/powertcp-$(dkms_package_version) 27 | 28 | endif 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PowerTCP for Linux 2 | This repository presents an implementation of the PowerTCP congestion control 3 | for the Linux kernel. The two variants of PowerTCP are provided as separate 4 | congestion control algorithms: 5 | - the telemetry-based *PowerTCP* and 6 | - the simplified, timing-based *RTT-PowerTCP* (called *θ-PowerTCP* in the 7 | [paper](#for-powertcp)). 8 | 9 | Please see the [references](#references) for background on this work. 10 | 11 | This repository contains two implementations of PowerTCP: a kernel module and an 12 | eBPF program. 13 | 14 | ## Step-by-step instructions 15 | 16 | The main focus of this work is on the eBPF implementation. Follow its 17 | [instructions](bpf/README.md) for experimenting with it. 18 | 19 | There is also a proof-of-concept implementation as a kernel module, see its 20 | [instructions](doc/module.md). 21 | 22 | ## Implementation details 23 | There is *some* documentation on aspects of the implementation(s) in 24 | [doc/](doc/). 25 | 26 | ## References 27 | 28 | ### For the work in this repository 29 | > Jörn-Thorben Hinz, Vamsi Addanki, Csaba Györgyi, Theo Jepsen, and Stefan Schmid. 
30 | > “TCP's Third Eye: Leveraging eBPF for Telemetry-Powered Congestion Control” 31 | > In *Proceedings of the 1st Workshop on eBPF and Kernel Extensions*, pp. 1-7. 2023. 32 | 33 | https://doi.org/10.1145/3609021.3609295 34 | 35 |
36 | Click for BibTex citation 37 | 38 | ```bib 39 | @inproceedings{tcpsthirdeye, 40 | author = {Hinz, J\"{o}rn-Thorben and Addanki, Vamsi and Gy\"{o}rgyi, Csaba and Jepsen, Theo and Schmid, Stefan}, 41 | title = {TCP's Third Eye: Leveraging EBPF for Telemetry-Powered Congestion Control}, 42 | year = {2023}, 43 | isbn = {9798400702938}, 44 | publisher = {Association for Computing Machinery}, 45 | address = {New York, NY, USA}, 46 | url = {https://doi.org/10.1145/3609021.3609295}, 47 | doi = {10.1145/3609021.3609295}, 48 | booktitle = {Proceedings of the 1st Workshop on EBPF and Kernel Extensions}, 49 | pages = {1–7}, 50 | numpages = {7}, 51 | keywords = {eBPF, datacenter, INT, congestion control, TCP, linux kernel}, 52 | location = {New York, NY, USA}, 53 | series = {eBPF '23} 54 | } 55 | ``` 56 | 57 |
58 | 59 | ### For PowerTCP 60 | > Vamsi Addanki, Oliver Michel, and Stefan Schmid. 61 | > “PowerTCP: Pushing the Performance Limits of Datacenter Networks” 62 | > In *19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)*, pp. 51-70. 2022. 63 | 64 | https://www.usenix.org/conference/nsdi22/presentation/addanki 65 | 66 |
67 | Click for BibTex citation 68 | 69 | ```bib 70 | @inproceedings{powertcp, 71 | author = {Vamsi Addanki and Oliver Michel and Stefan Schmid}, 72 | title = {{PowerTCP}: Pushing the Performance Limits of Datacenter Networks}, 73 | booktitle = {19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)}, 74 | year = {2022}, 75 | isbn = {978-1-939133-27-4}, 76 | address = {Renton, WA}, 77 | pages = {51--70}, 78 | url = {https://www.usenix.org/conference/nsdi22/presentation/addanki}, 79 | publisher = {USENIX Association}, 80 | month = apr 81 | } 82 | ``` 83 | 84 |
85 | 86 | ### For TCP-INT 87 | > Grzegorz Jereczek, Theo Jepsen, Simon Wass, Bimmy Pujari, Jerry Zhen, and Jeongkeun Lee. 88 | > “TCP-INT: Lightweight Network Telemetry with TCP Transport” 89 | > In *Proceedings of the SIGCOMM'22 Poster and Demo Sessions*, pp. 58-60. 2022. 90 | 91 | https://doi.org/10.1145/3546037.3546064 92 | 93 |
94 | Click for BibTex citation 95 | 96 | ```bib 97 | @inproceedings{tcpint, 98 | author = {Jereczek, Grzegorz and Jepsen, Theo and Wass, Simon and Pujari, Bimmy and Zhen, Jerry and Lee, Jeongkeun}, 99 | title = {TCP-INT: Lightweight Network Telemetry with TCP Transport}, 100 | year = {2022}, 101 | isbn = {9781450394345}, 102 | publisher = {Association for Computing Machinery}, 103 | address = {New York, NY, USA}, 104 | url = {https://doi.org/10.1145/3546037.3546064}, 105 | doi = {10.1145/3546037.3546064}, 106 | pages = {58–60}, 107 | numpages = {3}, 108 | keywords = {in-band network telemetry, network monitoring}, 109 | location = {Amsterdam, Netherlands}, 110 | series = {SIGCOMM '22} 111 | } 112 | ``` 113 | 114 |
115 | -------------------------------------------------------------------------------- /bpf/.gitignore: -------------------------------------------------------------------------------- 1 | *.skel.h 2 | powertcp 3 | vmlinux.h 4 | -------------------------------------------------------------------------------- /bpf/Makefile: -------------------------------------------------------------------------------- 1 | BPFTOOL ?= /usr/sbin/bpftool 2 | CLANG ?= clang 3 | LLVM_STRIP ?= llvm-strip 4 | VMLINUX ?= /sys/kernel/btf/vmlinux 5 | TCP_INT_DIR ?= tcp-int/code 6 | 7 | HAVE_WRITABLE_SK_PACING ?= 0 8 | USE_SWLAT_AS_TIMESTAMP ?= 0 9 | 10 | BPF_OBJS := powertcp.bpf.o 11 | BPF_DEPS := $(BPF_OBJS:.o=.d) 12 | BPF_SKELS := $(BPF_OBJS:.bpf.o=.skel.h) 13 | PROGS := powertcp 14 | PROG_DEPS := $(PROGS:=.d) 15 | PROG_OBJS := $(PROGS:=.o) 16 | VMLINUX_H := vmlinux.h 17 | 18 | # Copied from Linux' tools/scripts/Makefile.arch: 19 | ARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ 20 | -e s/sun4u/sparc/ -e s/sparc64/sparc/ \ 21 | -e /arm64/!s/arm.*/arm/ -e s/sa110/arm/ \ 22 | -e s/s390x/s390/ -e s/parisc64/parisc/ \ 23 | -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ 24 | -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \ 25 | -e s/riscv.*/riscv/) 26 | 27 | LIBBPF_CFLAGS := $(shell pkg-config --cflags libbpf) 28 | LIBBPF_LIBS := $(shell pkg-config --libs libbpf) 29 | 30 | BPF_CFLAGS := -g -target bpf -D__TARGET_ARCH_$(ARCH) \ 31 | $(LIBBPF_CFLAGS) -c -O2 \ 32 | -mcpu=v3 -Wall -DHAVE_WRITABLE_SK_PACING=$(HAVE_WRITABLE_SK_PACING) \ 33 | -DUSE_SWLAT_AS_TIMESTAMP=$(USE_SWLAT_AS_TIMESTAMP) \ 34 | -I.. -I$(TCP_INT_DIR)/include 35 | CXXFLAGS := -std=gnu++17 -O3 -Wall -Wextra $(LIBBPF_CFLAGS) -I.. -I$(TCP_INT_DIR)/src/tools 36 | DEP_CFLAGS := -M -MG -I.. 
-I$(TCP_INT_DIR)/include 37 | LDLIBS := $(LIBBPF_LIBS) 38 | 39 | .PHONY: all 40 | all: $(BPF_OBJS) $(PROGS) tcp_int 41 | 42 | .PHONY: clean 43 | clean: 44 | $(MAKE) -C tcp-int/code/src clean 45 | $(RM) $(BPF_DEPS) $(BPF_OBJS) $(BPF_SKELS) $(PROG_DEPS) $(PROG_OBJS) $(PROGS) $(VMLINUX_H) 46 | 47 | $(VMLINUX_H): $(VMLINUX) 48 | $(BPFTOOL) btf dump file $< format c > $@ 49 | 50 | %.bpf.o: %.bpf.c 51 | $(CLANG) $(BPF_CFLAGS) $< -o $@ 52 | $(LLVM_STRIP) -g $@ 53 | 54 | %.skel.h: %.bpf.o 55 | $(BPFTOOL) gen skeleton $< > $@ 56 | 57 | $(PROGS): CC=$(CXX) 58 | 59 | $(PROG_DEPS): DEP_CFLAGS += -I$(TCP_INT_DIR)/src/tools 60 | 61 | $(BPF_DEPS): %.d: %.c 62 | $(CC) $(DEP_CFLAGS) $< -MF $@ 63 | 64 | $(PROG_DEPS): %.d: %.cpp 65 | $(CXX) $(DEP_CFLAGS) $< -MF $@ 66 | 67 | .PHONY: tcp_int 68 | tcp_int: 69 | env --unset=VMLINUX_H $(MAKE) -C tcp-int/code/src 70 | 71 | ifneq ($(MAKECMDGOALS),clean) 72 | -include $(BPF_DEPS) $(PROG_DEPS) 73 | endif 74 | -------------------------------------------------------------------------------- /bpf/README.md: -------------------------------------------------------------------------------- 1 | # PowerTCP eBPF implementation 2 | 3 | > [!IMPORTANT] 4 | > The `bpf_powertcp` congestion control is fully functional but requires TCP-INT 5 | > to be deployed on your network switches. A description on how to deploy TCP-INT 6 | > is unfortunately out of the scope of this repository. You can find a hint in 7 | > the TCP-INT repository: [Switch Code](https://github.com/p4lang/p4app-TCP-INT/tree/v0.2.0-alpha#switch-code). 8 | 9 | > [!NOTE] 10 | > The `bpf_rttpowertcp` is fully functional when the network interface(s) 11 | > support hardware timestamping. You can check the support 12 | > with (as root/with `sudo`) 13 | > ``` 14 | > ethtool -T INTERFACE | grep hardware-receive 15 | > ``` 16 | > which should output `hardware-receive`. 17 | 18 | Following are step-by-step instructions on how to use and experiment with the 19 | PowerTCP eBPF implementation. 
All commands listed here are assumed to be executed 20 | in the root folder of this repository. 21 | 22 | When loaded into the kernel, the congestion control algorithms are called 23 | `bpf_powertcp` and `bpf_rttpowertcp`. 24 | 25 | ## Prerequisites 26 | 27 | ### In the network 28 | - TCP-INT 29 | [deployed](https://github.com/p4lang/p4app-TCP-INT/tree/v0.2.0-alpha#switch-code) 30 | on network switches 31 | 32 | ### On the hosts 33 | - Linux kernel 5.10 or above (ideally 6.0 or above) 34 | - `bpftool` version 5.15 or above 35 | - `clang` version 3.7 or above 36 | - `g++` version 10 or above 37 | - libbpf version 0.5 or above 38 | - `llvm-strip` 39 | - `make` 40 | 41 | The required versions are available starting with Debian 11 (Bullseye) and Ubuntu 42 | 22.04 (Jammy Jellyfish). 43 | 44 | The installation of the required software is shown in the following. 45 | 46 |
47 | Details on the kernel requirements 48 | 49 | The target kernel must be compiled with `CONFIG_DEBUG_INFO_BTF=y`. It usually 50 | is; check with 51 | ``` 52 | grep -w CONFIG_DEBUG_INFO_BTF /boot/config-$(uname -r) 53 | ``` 54 | 55 | For optimal performance, the target kernel can be 56 | [patched for `sk_pacing_rate` to be writable](https://lore.kernel.org/all/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de/) 57 | from eBPF code. This patch is included in kernel versions 6.0 and later, so no 58 | further action is required there. If the target kernel is *manually* patched, enable 59 | the usage of the pacing rate in the eBPF programs by appending 60 | `HAVE_WRITABLE_SK_PACING=1` to the below invocation of `make`. 61 | 62 |
63 | 64 | ## After checkout 65 | After checking out this repository, also check out TCP-INT, which is managed as a 66 | Git submodule in the bpf/tcp-int/ subdirectory: 67 | ``` 68 | git submodule update --init 69 | ``` 70 | 71 | ## Preparation 72 | 73 | The preparation steps need to be executed on both client and server. 74 | 75 | 1. Install required packages (as root/with `sudo`): 76 | ``` 77 | apt install 'bpftool|linux-tools-common$' clang g++ gcc libbpf-dev llvm make 78 | ``` 79 | 80 | Ideally, tune the network interface *IFACE* for low latency etc. (as root/with 81 | `sudo`): 82 | ``` 83 | apt install ethtool procps tuned 84 | ./tools/tune-eth IFACE 85 | ``` 86 | 2. Build the PowerTCP BPF program and TCP-INT: 87 | ``` 88 | make -C bpf/ 89 | ``` 90 | 91 | If you are using a modified TCP-INT P4 application that replaces the `swlat` 92 | telemetry field with a timestamp, append `USE_SWLAT_AS_TIMESTAMP=1` to the 93 | above invocation of `make`. 94 | 95 | Disable stripping of the object files (for more human-readable `objdump` 96 | output) by appending `LLVM_STRIP=/bin/true` to the above invocation of `make`. 97 | 3. For `bpf_rttpowertcp`, enable hardware timestamping on the relevant network 98 | interface(s) *IFACE(s)* (as root/with `sudo`): 99 | ``` 100 | ./bpf/powertcp enable-hwts IFACE(s) 101 | ``` 102 | 103 | ## On the server 104 | 105 | *Close any previously opened screen sessions that were opened this way.* 106 | 107 | Start `iperf` and `iperf3` server instances, ready to use PowerTCP, in a screen 108 | session (as root/with `sudo`): 109 | ``` 110 | ./tools/setup-bpf iperf-servers 111 | ``` 112 | **Beware: You are root user inside the screen session!** 113 | 114 | Algorithm parameters (see [On the client](#on-the-client)) do not need to be 115 | set on the server; they are irrelevant here.
116 | 117 | ## On the client 118 | 119 | > [!NOTE] 120 | > Applications that want to use `bpf_powertcp` or `bpf_rttpowertcp` must be 121 | > executed in the *tcp-int* cgroup. The `setup-bpf` script takes care of this. 122 | 123 | On the client, you can use PowerTCP in an interactive session or automatically 124 | record traces of the algorithm execution. 125 | 126 | ### Interactive usage 127 | 128 | *Close any previously opened screen sessions that were opened this way.* 129 | 130 | The `setup-bpf` script opens a screen session readily prepared to use PowerTCP. 131 | Applications executed in this screen session are in the *tcp-int* cgroup, as 132 | required. 133 | 134 | You can pass algorithm parameters to `setup-bpf`. You should pass at least 135 | `hop_bw` and `host_bw`, e.g. (as root/with `sudo`): 136 | ``` 137 | ./tools/setup-bpf iperf-client tracing host_bw=25000 hop_bw=25000 base_rtt=50 138 | ``` 139 | For a list of the available parameters see 140 | ``` 141 | ./bpf/powertcp -h 142 | ``` 143 | 144 | **Beware: You are root user inside the screen session!** 145 | 146 | Inside the screen session, you can, e.g., 147 | - run `iperf3` (or `iperf`, the options differ) 148 | ``` 149 | iperf3 -N -C bpf_powertcp -c SERVER_IP 150 | iperf3 -N -C bpf_rttpowertcp -c SERVER_IP 151 | ``` 152 | - or watch PowerTCP’s trace output 153 | ``` 154 | ./bpf/powertcp trace 155 | ``` 156 | (for CSV output append the option `-C`—or see [Record traces](#record-traces)) 157 | - or watch TCP-INT’s trace output 158 | ``` 159 | ./bpf/tcp-int/code/src/tools/tcp_int trace 160 | ``` 161 | - or quickly set up PowerTCP with different parameters 162 | ``` 163 | ./bpf/powertcp register -f tracing host_bw=100000 hop_bw=100000 base_rtt=50 gamma=0.7 164 | ``` 165 | 166 | ### Record traces 167 | 168 | *To record a trace, close any previously opened screen sessions opened for 169 | [interactive usage](#interactive-usage).* 170 | 171 | Record traces (as CSV files) of running `iperf`/`iperf3` with multiple
172 | **combinations** of algorithm parameters (as root/with `sudo`): 173 | ``` 174 | ./tools/bpf_tracer iperf3 -N -c SERVER_IP -C bpf_powertcp -- host_bw=25000 hop_bw=20000 base_rtt=50 beta="2 10" gamma="0.5 0.9" 175 | ``` 176 | 177 | `bpf_tracer` takes an `iperf`/`iperf3` command line followed by PowerTCP 178 | algorithm parameters, separated by a `--`: 179 | ``` 180 | ./tools/bpf_tracer IPERF(3)_CMDLINE -- POWERTCP_PARAMS 181 | ``` 182 | `IPERF(3)_CMDLINE` must contain a full `iperf`/`iperf3` *client* command line; 183 | *it must specify the congestion control algorithm to use*. 184 | 185 | `POWERTCP_PARAMS` can contain any of the parameters listed by 186 | `./bpf/powertcp -h`. Multiple values can be given for each parameter as a 187 | quoted string. 188 | 189 | The above example call produces 4 CSV files: 190 | ``` 191 | bpf_powertcp-gamma=0.5 base_rtt=50 hop_bw=20000 beta=10 host_bw=25000.csv 192 | bpf_powertcp-gamma=0.5 base_rtt=50 hop_bw=20000 beta=2 host_bw=25000.csv 193 | bpf_powertcp-gamma=0.9 base_rtt=50 hop_bw=20000 beta=10 host_bw=25000.csv 194 | bpf_powertcp-gamma=0.9 base_rtt=50 hop_bw=20000 beta=2 host_bw=25000.csv 195 | ``` 196 | -------------------------------------------------------------------------------- /bpf/bpf_ca_helpers.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | /* 3 | * Common helpers for an eBPF CA. 4 | * 5 | * Similar to Linux' tools/testing/selftests/bpf/bpf_tcp_helpers.h but without 6 | * type definitions. vmlinux.h is used here for those. Most parts are copied 7 | * from net/tcp.h. 
8 | */ 9 | 10 | #ifndef BPF_CA_HELPERS_H 11 | #define BPF_CA_HELPERS_H 12 | 13 | #include "vmlinux.h" 14 | 15 | #define MEGA 1000000UL 16 | #define SO_MAX_PACING_RATE 47 17 | #define SO_TIMESTAMPING_NEW 65 18 | #define SOL_SOCKET 1 19 | #define SOL_TCP 6 20 | #define TCP_INFINITE_SSTHRESH 0x7fffffff 21 | #define USEC_PER_SEC 1000000L 22 | #define NSEC_PER_SEC 1000000000L 23 | #define NSEC_PER_USEC 1000L 24 | 25 | #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) 26 | #define BITS_PER_BYTE 8 27 | #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) 28 | #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) 29 | #if __STDC_VERSION__ <= 201710L 30 | #define BUILD_BUG_ON(cond) _Static_assert(!(cond), "BUILD BUG: " #cond) 31 | #else 32 | #define BUILD_BUG_ON(cond) static_assert(!(cond), "BUILD BUG: " #cond) 33 | #endif 34 | #define ICSK_CA_PRIV_SIZE \ 35 | (sizeof(((struct inet_connection_sock *)NULL)->icsk_ca_priv)) 36 | #define max(x, y) (((x) > (y)) ? (x) : (y)) 37 | #define max_t(type, x, y) max((type)(x), (type)(y)) 38 | #define min(x, y) (((x) < (y)) ? (x) : (y)) 39 | #define min_t(type, x, y) min((type)(x), (type)(y)) 40 | 41 | static inline bool before(u32 seq1, u32 seq2) 42 | { 43 | return (s32)(seq1 - seq2) < 0; 44 | } 45 | #define after(seq2, seq1) before(seq1, seq2) 46 | 47 | static inline struct inet_connection_sock *inet_csk(const struct sock *sk) 48 | { 49 | return (struct inet_connection_sock *)sk; 50 | } 51 | 52 | static inline void *inet_csk_ca(const struct sock *sk) 53 | { 54 | return (void *)inet_csk(sk)->icsk_ca_priv; 55 | } 56 | 57 | /* Minimum RTT in usec. ~0 means not available. 
*/ 58 | static inline u32 tcp_min_rtt(const struct tcp_sock *tp) 59 | { 60 | return tp->rtt_min.s[0].v; 61 | } 62 | 63 | static inline struct tcp_sock *tcp_sk(const struct sock *sk) 64 | { 65 | return (struct tcp_sock *)sk; 66 | } 67 | 68 | static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) 69 | { 70 | return max_t(s64, t1 - t0, 0); 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /bpf/powertcp.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | /* 3 | * PowerTCP congestion control 4 | * 5 | * Based on the algorithm developed in: 6 | * Addanki, V., O. Michel, and S. Schmid. 7 | * "PowerTCP: Pushing the Performance Limits of Datacenter Networks." 8 | * 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 9 | * USENIX Association, 2022. 10 | * Available at: https://arxiv.org/pdf/2112.14309.pdf 11 | * 12 | * Implemented by: 13 | * Jörn-Thorben Hinz, TU Berlin, 2022. 14 | */ 15 | 16 | #include "bpf_ca_helpers.h" 17 | #include "powertcp_defs.h" 18 | 19 | #include "vmlinux.h" 20 | 21 | #include "../powertcp_trace.h" 22 | 23 | #include 24 | #include 25 | 26 | char _license[] SEC("license") = "Dual MIT/GPL"; 27 | 28 | #define ULONG_MAX (-1UL) 29 | 30 | #define POWERTCP_CONG_OPS_ATTRS SEC(".struct_ops") 31 | #define POWERTCP_CONG_OPS_FUNC(name, ...) 
\ 32 | SEC("struct_ops/" __stringify(name)) \ 33 | BPF_PROG(name, __VA_ARGS__) 34 | #define POWERTCP_CONG_OPS_FUNC_PTR (void *) 35 | #define POWERTCP_CONG_OPS_NAME_PREFIX bpf_ 36 | 37 | /* Configuration variables can only be set before loading the BPF object: */ 38 | #define POWERTCP_PARAM_ATTRS const volatile 39 | 40 | #include "powertcp_tcp-int_head.bpf.c" 41 | 42 | #include "../powertcp_head.c" 43 | 44 | POWERTCP_PARAM_ATTRS bool tracing = false; 45 | 46 | extern __u32 LINUX_KERNEL_VERSION __kconfig; 47 | 48 | struct { 49 | __uint(type, BPF_MAP_TYPE_SK_STORAGE); 50 | __uint(map_flags, BPF_F_NO_PREALLOC); 51 | __type(key, int); 52 | __type(value, u64); 53 | } map_powertcp_hwtstamps SEC(".maps"); 54 | 55 | struct { 56 | __uint(type, BPF_MAP_TYPE_RINGBUF); 57 | __uint(max_entries, 512 * 1024); 58 | } trace_events SEC(".maps"); 59 | 60 | /* Look for the host bandwidth (in Mbit/s). */ 61 | static unsigned long get_host_bw(struct sock *sk) 62 | { 63 | return host_bw; 64 | #if 0 65 | const struct dst_entry *dst = sk->sk_dst_cache; 66 | unsigned long bw = fallback_host_bw; 67 | 68 | if (dst && dst->dev) { 69 | struct ethtool_link_ksettings cmd; 70 | int r; 71 | 72 | rtnl_lock(); 73 | /* ethtool_params_from_link_mode() would be even simpler. 74 | * But dst->dev->link_mode seems to always be 0 at this point. 
*/ 75 | r = __ethtool_get_link_ksettings(dst->dev, &cmd); 76 | rtnl_unlock(); 77 | if (r == 0 && cmd.base.speed != SPEED_UNKNOWN) { 78 | bw = cmd.base.speed; 79 | pr_debug("hash=%u: got link speed: %lu Mbit/s\n", 80 | sk->sk_hash, bw); 81 | } else { 82 | pr_warn("link speed unavailable, using fallback: %lu Mbit/s\n", 83 | bw); 84 | } 85 | } 86 | 87 | return bw; 88 | #endif 89 | } 90 | 91 | static u64 get_tstamp(const struct sock *sk) 92 | { 93 | u64 *hwtstamp = bpf_sk_storage_get(&map_powertcp_hwtstamps, 94 | (struct sock *)sk, NULL, 0); 95 | if (hwtstamp && *hwtstamp) { 96 | return *hwtstamp; 97 | } 98 | 99 | return tcp_sk(sk)->tcp_clock_cache; 100 | } 101 | 102 | static void output_trace_event(struct powertcp_trace_event *trace_event) 103 | { 104 | trace_event->time = bpf_ktime_get_ns(); 105 | bpf_ringbuf_output(&trace_events, trace_event, sizeof(*trace_event), 0); 106 | } 107 | 108 | void require_hwtstamps(struct sock *sk) 109 | { 110 | /* Nothing to do here. For a BPF program to have __sk_buff.hwtstamp 111 | * populated, only ioctl(SIOCSHWTSTAMP) must be executed on the network 112 | * device. No bpf_setsockopt(SO_TIMESTAMPING_*) is necessary. 113 | */ 114 | } 115 | 116 | static void require_pacing(struct sock *sk) 117 | { 118 | /* When using a kernel version before 6.0 that is manually patched with 119 | * https://lore.kernel.org/all/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de/, 120 | * writing to sk_pacing_* can be enabled with HAVE_WRITABLE_SK_PACING=1 121 | * passed to make. 
122 | */ 123 | if (HAVE_WRITABLE_SK_PACING || 124 | LINUX_KERNEL_VERSION >= KERNEL_VERSION(6, 0, 0)) { 125 | /* We do want sk_pacing_rate to be respected: */ 126 | #if __clang_major__ >= 12 127 | // cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 128 | __sync_bool_compare_and_swap(&sk->sk_pacing_status, 129 | SK_PACING_NONE, SK_PACING_NEEDED); 130 | #else 131 | if (sk->sk_pacing_status == SK_PACING_NONE) { 132 | sk->sk_pacing_status = SK_PACING_NEEDED; 133 | } 134 | #endif 135 | } 136 | } 137 | 138 | /* Set the socket pacing rate (bytes per second). */ 139 | static void set_rate(struct sock *sk, unsigned long rate) 140 | { 141 | /* When using a kernel version before 6.0 that is manually patched with 142 | * https://lore.kernel.org/all/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de/, 143 | * writing to sk_pacing_* can be enabled with HAVE_WRITABLE_SK_PACING=1 144 | * passed to make. 145 | * 146 | * With an older and unpatched kernel, it is impossible to control 147 | * sk_pacing_rate here from BPF code. 148 | */ 149 | if (HAVE_WRITABLE_SK_PACING || 150 | LINUX_KERNEL_VERSION >= KERNEL_VERSION(6, 0, 0)) { 151 | sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); 152 | } 153 | } 154 | 155 | static bool tracing_enabled() 156 | { 157 | return tracing; 158 | } 159 | 160 | void POWERTCP_CONG_OPS_FUNC(powertcp_cong_avoid, struct sock *sk, u32 ack, 161 | u32 acked) 162 | { 163 | /* Before, tcp_congestion_ops.cong_avoid was non-optional in 164 | * net/ipv4/bpf_tcp_ca.c, even if it is never used when cong_control is 165 | * also set. This was fixed in Linux 6.0 with 166 | * https://lore.kernel.org/all/20220622191227.898118-3-jthinz@mailbox.tu-berlin.de/. 167 | * 168 | * This stub is kept here for compatibility with older kernels. 
169 | */ 170 | } 171 | 172 | SEC("cgroup_skb/ingress") 173 | int powertcp_hwtstamp(struct __sk_buff *skb) 174 | { 175 | struct bpf_sock *sk = skb->sk; 176 | if (sk) { 177 | u64 *hwtstamp = 178 | bpf_sk_storage_get(&map_powertcp_hwtstamps, sk, NULL, 179 | BPF_SK_STORAGE_GET_F_CREATE); 180 | if (hwtstamp) { 181 | __u64 hwts = skb->hwtstamp; 182 | __u64 ts = skb->tstamp; 183 | *hwtstamp = hwts > 0 ? hwts : ts; 184 | } 185 | } 186 | 187 | return 1; 188 | } 189 | 190 | #include "powertcp_tcp-int.bpf.c" 191 | 192 | #include "../powertcp.c" 193 | -------------------------------------------------------------------------------- /bpf/powertcp.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | /* 3 | * Loader and configuration tool for the eBPF implementation of the PowerTCP 4 | * congestion control algorithm. 5 | * 6 | * Author: 7 | * Jörn-Thorben Hinz, TU Berlin, 2022. 8 | */ 9 | #include "powertcp.skel.h" 10 | #include "powertcp_defs.h" 11 | 12 | #include "tcp_int.h" 13 | 14 | #include 15 | #include 16 | #if !defined(LIBBPF_MAJOR_VERSION) || LIBBPF_MAJOR_VERSION < 1 17 | #include 18 | #endif 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | #include "powertcp_trace.h" 47 | 48 | namespace 49 | { 50 | template 51 | struct delete_func_wrapper { 52 | void operator()(T *ptr) const noexcept 53 | { 54 | DeleteFunc(ptr); 55 | } 56 | }; 57 | 58 | using arg_vector = std::vector; 59 | 60 | template 61 | using ptr_with_delete_func = 62 | std::unique_ptr >; 63 | 64 | using powertcp_bpf_ptr = 65 | ptr_with_delete_func; 66 | 67 | struct powertcp_param_bool { 68 | using rodata_type = 
bool; 69 | std::size_t rodata_off; 70 | }; 71 | 72 | struct powertcp_param_double { 73 | using rodata_type = long; 74 | std::size_t rodata_off; 75 | double scale; 76 | }; 77 | 78 | struct powertcp_param_long { 79 | using rodata_type = long; 80 | std::size_t rodata_off; 81 | }; 82 | 83 | using powertcp_param = std::variant; 85 | 86 | struct powertcp_param_visitor { 87 | const std::string &str; 88 | powertcp_bpf::powertcp_bpf__rodata *rodata; 89 | 90 | void operator()(const powertcp_param_bool &par) const 91 | { 92 | assign_param(true, par, rodata); 93 | } 94 | 95 | void operator()(const powertcp_param_double &par) const 96 | { 97 | assign_param(std::stod(str) * par.scale, par, rodata); 98 | } 99 | 100 | void operator()(const powertcp_param_long &par) const 101 | { 102 | assign_param(std::stol(str), par, rodata); 103 | } 104 | 105 | template 106 | void assign_param(T val, P param, 107 | powertcp_bpf::powertcp_bpf__rodata *rodata) const 108 | { 109 | assert(rodata != nullptr); 110 | 111 | auto &rodata_param = 112 | *reinterpret_cast( 113 | reinterpret_cast(rodata) + 114 | param.rodata_off); 115 | /* TODO: Maybe check if a value is in the allowed range. Or do that in 116 | * the BPF code. 
*/ 117 | rodata_param = val; 118 | } 119 | }; 120 | 121 | using ring_buffer_ptr = ptr_with_delete_func; 122 | 123 | class unique_fd { 124 | public: 125 | unique_fd() noexcept : fd_{ -1 } 126 | { 127 | } 128 | 129 | explicit unique_fd(int fd) noexcept : fd_{ fd } 130 | { 131 | } 132 | 133 | unique_fd(const unique_fd &) = delete; 134 | unique_fd &operator=(const unique_fd &) = delete; 135 | 136 | unique_fd(unique_fd &&other) noexcept 137 | : fd_{ std::exchange(other.fd_, -1) } 138 | { 139 | } 140 | 141 | unique_fd &operator=(unique_fd &&other) noexcept 142 | { 143 | close(); 144 | std::swap(fd_, other.fd_); 145 | return *this; 146 | } 147 | 148 | ~unique_fd() 149 | { 150 | close(); 151 | } 152 | 153 | explicit operator bool() const noexcept 154 | { 155 | return fd_ > -1; 156 | } 157 | 158 | void close() noexcept 159 | { 160 | if (fd_ > -1) { 161 | ::close(fd_); /* Ignoring any errors here. */ 162 | fd_ = -1; 163 | } 164 | } 165 | 166 | int get() const noexcept 167 | { 168 | return fd_; 169 | } 170 | 171 | private: 172 | int fd_; 173 | }; 174 | 175 | using bpf_link_ptr = 176 | std::unique_ptr >; 178 | 179 | #define POWERTCP_RODATA_OFFSET(member) \ 180 | offsetof(powertcp_bpf::powertcp_bpf__rodata, member) 181 | const std::unordered_map params = { 182 | { "base_rtt", powertcp_param_long{ POWERTCP_RODATA_OFFSET(base_rtt) } }, 183 | { "beta", powertcp_param_long{ POWERTCP_RODATA_OFFSET(beta) } }, 184 | { "expected_flows", 185 | powertcp_param_long{ POWERTCP_RODATA_OFFSET(expected_flows) } }, 186 | { "gamma", 187 | powertcp_param_double{ POWERTCP_RODATA_OFFSET(gamma), gamma_scale } }, 188 | { "hop_bw", powertcp_param_long{ POWERTCP_RODATA_OFFSET(hop_bw) } }, 189 | { "host_bw", powertcp_param_long{ POWERTCP_RODATA_OFFSET(host_bw) } }, 190 | { "tracing", powertcp_param_bool{ POWERTCP_RODATA_OFFSET(tracing) } }, 191 | }; 192 | #undef POWERTCP_RODATA_OFFSET 193 | 194 | const std::filesystem::path powertcp_pin_dir = "/sys/fs/bpf/powertcp"; 195 | 196 | volatile 
std::sig_atomic_t running = true; 197 | 198 | void parse_param(std::string_view param_arg, 199 | powertcp_bpf::powertcp_bpf__rodata *rodata) 200 | { 201 | std::istringstream iss(std::string{ param_arg }); 202 | 203 | std::string name_tok; 204 | std::getline(iss, name_tok, '='); 205 | 206 | const auto param_iter = params.find(name_tok); 207 | if (param_iter == std::end(params)) { 208 | std::ostringstream oss; 209 | oss << "Unknown algorithm parameter '" << name_tok << "'"; 210 | throw std::invalid_argument(oss.str()); 211 | } 212 | 213 | std::string value_tok; 214 | std::getline(iss, value_tok, '='); 215 | 216 | try { 217 | std::visit(powertcp_param_visitor{ value_tok, rodata }, 218 | param_iter->second); 219 | } catch (const std::invalid_argument &) { 220 | std::ostringstream oss; 221 | oss << "Invalid value '" << value_tok << "' for parameter " 222 | << name_tok << ": invalid number"; 223 | throw std::invalid_argument(oss.str()); 224 | } catch (const std::out_of_range &) { 225 | std::ostringstream oss; 226 | oss << "Invalid value '" << value_tok << "' for parameter " 227 | << name_tok << ": out of range"; 228 | throw std::out_of_range(oss.str()); 229 | } 230 | } 231 | 232 | void pin_map(bpf_map *map) 233 | { 234 | assert(map != nullptr); 235 | 236 | const char *map_name = bpf_map__name(map); 237 | const auto pin_path = powertcp_pin_dir / map_name; 238 | if (bpf_map__pin(map, pin_path.c_str())) { 239 | if (errno == EEXIST) { 240 | fprintf(stderr, "%s is already pinned, skipping\n", 241 | map_name); 242 | return; 243 | } 244 | 245 | std::ostringstream oss; 246 | oss << "bpf_map__pin(" << map_name << ")"; 247 | throw std::system_error(errno, std::generic_category(), 248 | oss.str()); 249 | } 250 | } 251 | 252 | void attach_and_pin_cgroup_prog(bpf_program *prog, 253 | std::filesystem::path cgroup_path) 254 | { 255 | const char *prog_name = bpf_program__name(prog); 256 | 257 | std::filesystem::create_directory(cgroup_path); 258 | 259 | const auto cgroup_fd = unique_fd{ 
open(cgroup_path.c_str(), O_RDONLY) }; 260 | if (!cgroup_fd) { 261 | throw std::system_error(errno, std::generic_category(), 262 | "open(cgroup_path)"); 263 | } 264 | 265 | const auto link = bpf_link_ptr{ bpf_program__attach_cgroup( 266 | prog, cgroup_fd.get()) }; 267 | if (!link) { 268 | std::ostringstream oss; 269 | oss << "bpf_program__attach_cgroup(" << prog_name << ")"; 270 | throw std::system_error(errno, std::generic_category(), 271 | oss.str()); 272 | } 273 | 274 | const auto pin_path = 275 | powertcp_pin_dir / 276 | std::filesystem::path{ "link_" }.concat(prog_name); 277 | if (bpf_link__pin(link.get(), pin_path.c_str())) { 278 | if (errno == EEXIST) { 279 | fprintf(stderr, "%s is already pinned, skipping\n", 280 | prog_name); 281 | return; 282 | } 283 | 284 | std::ostringstream oss; 285 | oss << "bpf_link__pin(" << prog_name << ")"; 286 | throw std::system_error(errno, std::generic_category(), 287 | oss.str()); 288 | } 289 | } 290 | 291 | void attach_struct_ops(bpf_map *struct_ops) 292 | { 293 | auto link = bpf_link_ptr{ bpf_map__attach_struct_ops(struct_ops) }; 294 | if (!link) { 295 | if (errno == EEXIST) { 296 | fprintf(stderr, "%s is already registered, skipping\n", 297 | bpf_map__name(struct_ops)); 298 | return; 299 | } 300 | 301 | std::ostringstream oss; 302 | oss << "attach_struct_ops(" << bpf_map__name(struct_ops) << ")"; 303 | throw std::system_error(errno, std::generic_category(), 304 | oss.str()); 305 | } 306 | 307 | /* Have to __disconnect() before __destroy() so the attached struct_ops 308 | * outlive this userspace program. 
309 | */ 310 | bpf_link__disconnect(link.get()); 311 | } 312 | 313 | void delete_struct_ops(std::string_view map_name) 314 | { 315 | unique_fd fd; 316 | __u32 id = 0; 317 | 318 | auto info = bpf_map_info{}; 319 | __u32 info_len = sizeof(info); 320 | 321 | while (true) { 322 | if (bpf_map_get_next_id(id, &id)) { 323 | if (errno != ENOENT) { 324 | throw std::system_error(errno, 325 | std::generic_category(), 326 | "map_get_next_id"); 327 | } 328 | return; 329 | } 330 | 331 | fd = unique_fd(bpf_map_get_fd_by_id(id)); 332 | if (!fd) { 333 | if (errno == ENOENT) { 334 | continue; 335 | } 336 | throw std::system_error(errno, std::generic_category(), 337 | "map_get_fd_by_id"); 338 | } 339 | 340 | if (bpf_obj_get_info_by_fd(fd.get(), &info, &info_len)) { 341 | throw std::system_error(errno, std::generic_category(), 342 | "obj_get_info_by_fd"); 343 | } 344 | 345 | if (info.type == BPF_MAP_TYPE_STRUCT_OPS && 346 | map_name == info.name) { 347 | break; 348 | } 349 | } 350 | 351 | constexpr auto zero = 0; 352 | if (bpf_map_delete_elem(fd.get(), &zero)) { 353 | throw std::system_error(errno, std::generic_category(), 354 | "map_delete_elem"); 355 | } 356 | } 357 | 358 | void enable_hwts(std::string_view dev) 359 | { 360 | auto fd = unique_fd{ socket(AF_UNIX, SOCK_DGRAM, 0) }; 361 | if (!fd) { 362 | throw std::system_error{ 363 | std::make_error_code(std::errc{ errno }), "socket" 364 | }; 365 | } 366 | 367 | hwtstamp_config hwts_conf = {}; 368 | hwts_conf.rx_filter = HWTSTAMP_FILTER_ALL; 369 | 370 | ifreq ifr = {}; 371 | ifr.ifr_data = reinterpret_cast<__caddr_t>(&hwts_conf); 372 | 373 | dev.copy(ifr.ifr_name, sizeof(ifr.ifr_name) - 1); 374 | assert(ifr.ifr_name[sizeof(ifr.ifr_name) - 1] == '\0'); 375 | if (std::size(dev) != std::strlen(ifr.ifr_name)) { 376 | std::ostringstream oss; 377 | oss << "device name too long: " << dev; 378 | throw std::invalid_argument{ oss.str() }; 379 | } 380 | 381 | if (ioctl(fd.get(), SIOCSHWTSTAMP, &ifr)) { 382 | const auto err = 
std::make_error_code(std::errc{ errno }); 383 | std::ostringstream oss; 384 | 385 | if (err == std::errc::not_supported) { 386 | oss << dev << " does not support hardware timestamping"; 387 | throw std::runtime_error{ oss.str() }; 388 | } else { 389 | oss << dev << ": ioctl(SIOCSHWTSTAMP)"; 390 | throw std::system_error{ err, oss.str() }; 391 | } 392 | } 393 | } 394 | 395 | void do_enable_hwts(const arg_vector &args) 396 | { 397 | for (auto &&arg : args) { 398 | enable_hwts(arg); 399 | } 400 | } 401 | 402 | void do_register(const arg_vector &args) 403 | { 404 | auto skel = powertcp_bpf_ptr{ powertcp_bpf__open() }; 405 | if (!skel) { 406 | throw std::system_error(errno, std::generic_category(), "open"); 407 | } 408 | 409 | for (auto &&arg : args) { 410 | parse_param(arg, skel->rodata); 411 | } 412 | 413 | auto map_fd = unique_fd( 414 | bpf_obj_get(TCP_INT_BPF_PIN_PATH "/map_tcp_int_state")); 415 | if (!map_fd) { 416 | throw std::system_error(errno, std::generic_category(), 417 | "obj_get(map_tcp_int_state)"); 418 | } 419 | 420 | auto *map_tcp_int_state = 421 | bpf_object__find_map_by_name(skel->obj, "map_tcp_int_state"); 422 | if (!map_tcp_int_state) { 423 | throw std::system_error(errno, std::generic_category(), 424 | "find_map_by_name(map_tcp_int_state)"); 425 | } 426 | 427 | if (bpf_map__reuse_fd(map_tcp_int_state, map_fd.get()) < 0) { 428 | throw std::system_error(errno, std::generic_category(), 429 | "reuse_fd(map_tcp_int_state)"); 430 | } 431 | 432 | if (powertcp_bpf__load(skel.get())) { 433 | throw std::system_error(errno, std::generic_category(), "load"); 434 | } 435 | 436 | attach_struct_ops(skel->maps.powertcp); 437 | attach_struct_ops(skel->maps.rttpowertcp); 438 | 439 | attach_and_pin_cgroup_prog(skel->progs.powertcp_hwtstamp, 440 | TCP_INT_CGROUP_PATH); 441 | 442 | /* struct_ops program maps are "pinned"/kept alive in their own way (see 443 | * the comment in attach_struct_ops()), we only want to pin other maps 444 | * here: 445 | */ 446 | 
pin_map(skel->maps.map_powertcp_hwtstamps); 447 | pin_map(skel->maps.trace_events); 448 | } 449 | 450 | int handle_trace_event(void * /* ctx */, void *data, std::size_t /* data_sz */) 451 | { 452 | /* TODO: If it seems appropriate later, merge handle_trace_event() and 453 | * handle_trace_event_csv() and just use two different format strings. 454 | */ 455 | const powertcp_trace_event &ev = 456 | *static_cast(data); 457 | 458 | /* 459 | * Desired alignment in the output, showing the maximum value per data type: 460 | * 461 | * # Time (us) Socket hash CWND (segments) Pacing rate (Mbit/s) Norm. power Smoothed power Queue length (bytes) Delta t (ns) Tx. bytes diff RTT grad. 462 | * 18446744073709551615 4294967295 4294967295 xxxxxxxxxx x.yyyyyyyy x.yyyyyyyy 4294967295 4294967295 4294967295 x.yyyyyyyy 463 | */ 464 | std::printf( 465 | "%20llu %10u %10u %10lu %10.8f %10.8f %10ld %10u %10u %10.8f\n", 466 | ev.time, ev.sock_hash, ev.cwnd, ev.rate * 8 / 1000000, 467 | static_cast(ev.p_norm) / power_scale, 468 | static_cast(ev.p_smooth) / power_scale, ev.qlen, 469 | ev.delta_t, ev.tx_bytes_diff, 470 | static_cast(ev.rtt_grad) / power_scale); 471 | 472 | return 0; 473 | } 474 | 475 | int handle_trace_event_csv(void * /* ctx */, void *data, 476 | std::size_t /* data_sz */) 477 | { 478 | /* TODO: If it seems appropriate later, merge handle_trace_event() and 479 | * handle_trace_event_csv() and just use two different format strings. 
480 | */ 481 | const auto &ev = *static_cast(data); 482 | 483 | std::printf("%llu,%u,%u,%lu,%0f,%0f,%ld,%u,%u,%0f\n", ev.time, 484 | ev.sock_hash, ev.cwnd, ev.rate, 485 | static_cast(ev.p_norm) / power_scale, 486 | static_cast(ev.p_smooth) / power_scale, ev.qlen, 487 | ev.delta_t, ev.tx_bytes_diff, 488 | static_cast(ev.rtt_grad) / power_scale); 489 | 490 | return 0; 491 | } 492 | 493 | void do_trace(bool output_csv) 494 | { 495 | auto map_fd = unique_fd{ bpf_obj_get( 496 | (powertcp_pin_dir / "trace_events").c_str()) }; 497 | if (!map_fd) { 498 | throw std::system_error(-map_fd.get(), std::generic_category(), 499 | "bpf_obj_get"); 500 | } 501 | 502 | auto handle_func = 503 | output_csv ? handle_trace_event_csv : handle_trace_event; 504 | auto ring_buf = ring_buffer_ptr{ ring_buffer__new( 505 | map_fd.get(), handle_func, nullptr, nullptr) }; 506 | if (!ring_buf) { 507 | throw std::system_error(errno, std::generic_category(), 508 | "ring_buffer__new"); 509 | } 510 | 511 | const char *output_header; 512 | if (output_csv) { 513 | output_header = 514 | "time,hash,cwnd,rate,p_norm,p_smooth,qlen,delta_t,tx_bytes_diff,rtt_grad"; 515 | } else { 516 | output_header = 517 | "# Time (us) Socket hash CWND (segments) Pacing rate (Mbit/s) Norm. power Smoothed power Queue length (bytes) Delta t (ns) Tx. 
bytes diff RTT grad."; 518 | } 519 | 520 | auto repeated_timeout = true; 521 | std::puts(output_header); 522 | while (running) { 523 | if (auto err = ring_buffer__poll(ring_buf.get(), 100); 524 | err < 0 && err != -EINTR) { 525 | throw std::system_error(-err, std::generic_category(), 526 | "ring_buffer__poll"); 527 | } else if (err == 0 && !repeated_timeout) { 528 | /* err == 0 is a timeout */ 529 | if (!output_csv) { 530 | std::puts(output_header); 531 | } 532 | ::fflush(stdout); 533 | repeated_timeout = true; 534 | } else if (err > 0) { 535 | repeated_timeout = false; 536 | } 537 | } 538 | } 539 | 540 | void do_unregister() 541 | { 542 | delete_struct_ops("powertcp"); 543 | delete_struct_ops("rttpowertcp"); 544 | std::filesystem::remove_all(powertcp_pin_dir); 545 | } 546 | 547 | void handle_signal(int /* sig */) 548 | { 549 | running = false; 550 | } 551 | 552 | void usage(const char *prog, FILE *outfile) 553 | { 554 | fprintf(outfile, 555 | "Usage: %1$s enable-hwts [DEVICE...]\n" 556 | " %1$s [OPTION...] register [PARAMETER...]\n" 557 | " %1$s [OPTION...] 
trace | unregister\n" 558 | "\n" 559 | "COMMANDS\n" 560 | " enable-hwts\n" 561 | " Enable hardware timestamping on the given network device(s).\n" 562 | "\n" 563 | " register\n" 564 | " Register the PowerTCP eBPF programs, optionally setting algorithm\n" 565 | " parameters.\n" 566 | "\n" 567 | " trace\n" 568 | " Trace the execution of the algorithm.\n" 569 | "\n" 570 | " unregister\n" 571 | " Unregister the PowerTCP eBPF programs.\n" 572 | "\n" 573 | "OPTIONS\n" 574 | " -C\n" 575 | " Output traced values in CSV format.\n" 576 | "\n" 577 | " -f\n" 578 | " Force an unregister before a register so parameters can be set to\n" 579 | " new values.\n" 580 | "\n" 581 | "PARAMETERS\n" 582 | " The following parameters of the PowerTCP algorithm can be set with the\n" 583 | " register command:\n" 584 | " - base_rtt in µs\n" 585 | " - beta in number of packets\n" 586 | " - expected_flows in number of flows\n" 587 | " - gamma in range 0.0 to 1.0\n" 588 | " - hop_bw in Mbit/s\n" 589 | " - host_bw in Mbit/s\n" 590 | "\n" 591 | " Passing the additional, value-less parameter \"tracing\" enables tracing\n" 592 | " the algorithm with trace command.\n" 593 | "\n" 594 | "EXAMPLE\n" 595 | "\n" 596 | " # %1$s register expected_flows=1\n" 597 | " # %1$s enable-hwts eno1 eno2 eno3\n" 598 | "\n", 599 | prog); 600 | } 601 | } // namespace 602 | 603 | int main(int argc, char *argv[]) 604 | { 605 | bool force = false; 606 | auto output_csv = false; 607 | 608 | int opt; 609 | while (-1 != (opt = getopt(argc, argv, "Cfh"))) { 610 | switch (opt) { 611 | case 'C': 612 | output_csv = true; 613 | break; 614 | case 'f': 615 | force = true; 616 | break; 617 | case 'h': 618 | usage(argv[0], stdout); 619 | return EXIT_SUCCESS; 620 | default: 621 | usage(argv[0], stderr); 622 | return EXIT_FAILURE; 623 | } 624 | } 625 | 626 | if (optind >= argc) { 627 | usage(argv[0], stderr); 628 | return EXIT_FAILURE; 629 | } 630 | 631 | struct sigaction sigact = {}; 632 | sigact.sa_handler = handle_signal; 633 | 
sigact.sa_flags = SA_RESETHAND; 634 | if (sigaction(SIGINT, &sigact, nullptr)) { 635 | std::perror("sigaction"); 636 | return EXIT_FAILURE; 637 | } 638 | 639 | #if !defined(LIBBPF_MAJOR_VERSION) || LIBBPF_MAJOR_VERSION < 1 640 | if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL)) { 641 | std::perror("libbpf_set_strict_mode"); 642 | return EXIT_FAILURE; 643 | } 644 | #endif 645 | 646 | const auto cmd = std::string_view{ argv[optind] }; 647 | const auto args = arg_vector(argv + optind + 1, argv + argc); 648 | 649 | if (cmd == "enable-hwts") { 650 | try { 651 | do_enable_hwts(args); 652 | } catch (const std::exception &e) { 653 | fprintf(stderr, "%s\n", e.what()); 654 | return EXIT_FAILURE; 655 | } 656 | } else if (cmd == "register") { 657 | if (force) { 658 | try { 659 | do_unregister(); 660 | } catch (const std::exception &e) { 661 | fprintf(stderr, "%s\n", e.what()); 662 | } 663 | } 664 | 665 | try { 666 | do_register(args); 667 | } catch (const std::exception &e) { 668 | fprintf(stderr, "%s\n", e.what()); 669 | return EXIT_FAILURE; 670 | } 671 | } else if (cmd == "trace") { 672 | try { 673 | do_trace(output_csv); 674 | } catch (const std::exception &e) { 675 | fprintf(stderr, "%s\n", e.what()); 676 | return EXIT_FAILURE; 677 | } 678 | } else if (cmd == "unregister") { 679 | if (argc - optind > 2) { 680 | fprintf(stderr, 681 | "unexpected argument(s) after 'unregister'\n"); 682 | return EXIT_FAILURE; 683 | } 684 | try { 685 | do_unregister(); 686 | } catch (const std::exception &e) { 687 | fprintf(stderr, "%s\n", e.what()); 688 | return EXIT_FAILURE; 689 | } 690 | } else { 691 | usage(argv[0], stderr); 692 | return EXIT_FAILURE; 693 | } 694 | } 695 | -------------------------------------------------------------------------------- /bpf/powertcp_tcp-int.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | 3 | #include "tcp_int_common.h" 4 | #include "tcp_int_common.bpf.h" 5 | 6 | static 
const struct powertcp_int *get_int(struct sock *sk, 7 | const struct powertcp_int *prev_int) 8 | { /* Build a one-hop INT sample in cached_int from the socket's TCP-INT state. Returns NULL, with cached_int.n_hop set to 0, when the socket has no TCP-INT state or when no time has passed since prev_int. */ 9 | struct ptcp_powertcp *ca = inet_csk_ca(sk); 10 | struct powertcp_int_impl *int_impl = &ca->int_impl; 11 | const struct tcp_sock *tp = tcp_sk(sk); 12 | /* Not using tcp_int_get_state() here since it uses 13 | * BPF_SK_STORAGE_GET_F_CREATE. We might want to use a missing map entry as 14 | * an indicator to fall back to RTT-PowerTCP. 15 | */ 16 | const struct tcp_int_state *tint = 17 | bpf_sk_storage_get(&map_tcp_int_state, sk, NULL, 0); 18 | 19 | if (tint) { 20 | u32 bandwidth = BITS_TO_BYTES(hop_bw); 21 | #if USE_SWLAT_AS_TIMESTAMP 22 | u32 ts = tint->swlat; 23 | #else 24 | u32 ts = get_tstamp(sk); 25 | #endif /* Without a previous sample the smoothed RTT serves as interval; srtt_us is kept left-shifted by 3, so * (1000u >> 3) converts it to ns. */ 26 | u32 dt = (!prev_int ? tp->srtt_us * (1000u >> 3) : 27 | ts - prev_int->hops[0].ts) & 28 | max_ts; 29 | 30 | if (dt == 0) { 31 | int_impl->cached_int.n_hop = 0; 32 | return NULL; 33 | } 34 | 35 | int_impl->cached_int.n_hop = 1; 36 | /* TCP-INT does not provide an identification for the path. */ 37 | /* TODO: Evaluate if it makes sense to use the switch ID as path ID. 38 | * Could lead to a too frequently detected path change, though. 39 | */ 40 | int_impl->cached_int.path_id = 1; 41 | 42 | int_impl->cached_int.hops[0].bandwidth = bandwidth; 43 | int_impl->cached_int.hops[0].qlen = tint->qdepth; 44 | int_impl->cached_int.hops[0].ts = ts; 45 | /* In lack of a tx_bytes value, we estimate it here.
A factor of 46 | * MEGA/USEC_PER_SEC is cancelled in the calculation. All factors are multiplied in 64 bit before dividing, so the integer divisions do not truncate the estimate (e.g. to 0 for a small bandwidth * util product): 47 | */ 48 | int_impl->cached_int.hops[0].tx_bytes = 49 | (u64)bandwidth * tint->util * dt / 100 / NSEC_PER_USEC; 50 | 51 | return &int_impl->cached_int; 52 | } else { 53 | int_impl->cached_int.n_hop = 0; 54 | } 55 | 56 | return NULL; 57 | } 58 | 59 | static const struct powertcp_int *get_prev_int(struct sock *sk) 60 | { 61 | struct ptcp_powertcp *ca = inet_csk_ca(sk); 62 | struct powertcp_int_impl *int_impl = &ca->int_impl; 63 | struct powertcp_int *prev_int = &int_impl->prev_int; 64 | 65 | if (prev_int->n_hop) { 66 | /* With TCP-INT, the difference in tx_bytes since last ACK is already 67 | * estimated in get_int(). The previous value must be 0 so 68 | * ptcp_norm_power() does not calculate a second difference with a 69 | * value potentially coming from a different switch. 70 | */ 71 | prev_int->hops[0].tx_bytes = 0; 72 | return prev_int; 73 | } 74 | 75 | return NULL; 76 | } 77 | 78 | static int int_impl_init(struct sock *sk) 79 | { 80 | return 0; 81 | } 82 | 83 | static void int_impl_release(struct sock *sk) 84 | { 85 | /* no-op */ 86 | } 87 | 88 | static void int_impl_reset(powertcp_int_impl_t *int_impl, enum tcp_ca_event ev) 89 | { 90 | int_impl->prev_int.path_id = 0; 91 | } 92 | 93 | static void int_impl_update_old(powertcp_int_impl_t *int_impl) 94 | { 95 | int_impl->prev_int = int_impl->cached_int; 96 | } 97 | -------------------------------------------------------------------------------- /bpf/powertcp_tcp-int_head.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | 3 | enum { max_n_hops = 1 }; 4 | 5 | #include "powertcp_int.c" 6 | 7 | /* TCP-INT's swlat field (which we optionally replace with a timestamp), is 8 | * only 24 bits long.
9 | */ 10 | static const unsigned int max_ts = 0xFFFFFFu; 11 | 12 | /* In case the tx_bytes value is taken directly from a less-than-32-bit INT 13 | * field, its maximum value has to be known for correct wrap-around in 14 | * calculations. 15 | */ 16 | static const u32 max_tx_bytes = 0xFFFFFFFFu; 17 | 18 | struct powertcp_int_impl { 19 | struct powertcp_int cached_int; 20 | struct powertcp_int prev_int; 21 | }; 22 | typedef struct powertcp_int_impl powertcp_int_impl_t; 23 | -------------------------------------------------------------------------------- /dkms.conf: -------------------------------------------------------------------------------- 1 | PACKAGE_NAME="powertcp" 2 | PACKAGE_VERSION="0.0.1" 3 | BUILT_MODULE_NAME[0]="tcp_powertcp" 4 | DEST_MODULE_LOCATION[0]="/kernel/net/ipv4/" 5 | AUTOINSTALL="yes" 6 | -------------------------------------------------------------------------------- /doc/code-structure.md: -------------------------------------------------------------------------------- 1 | # Code structure for module and BPF implementation 2 | 3 | This is the common code structure for the module’s `tcp_powertcp.c` and the BPF 4 | implementation’s `powertcp.bpf.c`. It uses direct `#include`s of source files 5 | (instead of using multiple compilation units with headers) to enable full 6 | inlining and other optimizations for both the module and BPF implementation. 7 | 8 | The parts *must* appear in this order: 9 | 10 | 1. General `#include`s, including the required `linux/types.h` (module) or 11 | `vmlinux.h` (BPF) 12 | 13 | 2. An `#include` defining INT-related values and types in this order: 14 | 1. Provide `max_n_hops` as an enumerator 15 | 16 | 2. `#include "powertcp_int.c"` which provides `struct powertcp_int` 17 | (requires `max_n_hops`) and other structs 18 | 19 | 3. Provide a typedef for `powertcp_int_impl_t`, which can alias e.g. 
a 20 | struct or a pointer, and constants `max_ts` and `max_tx_bytes` 21 | 22 | This include should be named `tcp_powertcp_METHOD_head.c` resp. 23 | `powertcp_METHOD_head.bpf.c`, e.g. `tcp_powertcp_foobar_head.c` or 24 | `powertcp_tcp-int_head.bpf.c`. 25 | 26 | `powertcp_no-int_head.c` shows the required content of this file. 27 | 28 | 3. `#define`s for various `POWERTCP_*` macros as needed; their default values 29 | are defined in `powertcp_head.c` 30 | 31 | 4. `#include "powertcp_head.c"` providing the core `struct powertcp` (requires 32 | `powertcp_int_impl_t`), the variables for the algorithm parameters, and 33 | default `#define`s for the still undefined `POWERTCP_*` macros 34 | 35 | 5. Additional (algorithm) parameter variables, other static/constant variables 36 | 37 | 6. Definitions of the required, module- or BPF-specific functions 38 | 39 | 7. An `#include` defining the INT-related functions 40 | 41 | This include should be named `tcp_powertcp_METHOD.c` resp. 42 | `powertcp_METHOD.bpf.c`, e.g. `tcp_powertcp_foobar.c` or 43 | `powertcp_tcp-int.bpf.c`. 44 | 45 | `powertcp_no-int.c` shows the required content of this file. 46 | 47 | 8. `#include "powertcp.c"` of the algorithm implementation 48 | 49 | 9. Additional definitions of functions requiring the PowerTCP 50 | `tcp_congestion_ops` instances, e.g. `module_init` and `module_exit` 51 | -------------------------------------------------------------------------------- /doc/module.md: -------------------------------------------------------------------------------- 1 | # PowerTCP kernel module 2 | 3 | > [!IMPORTANT] 4 | > The kernel module is missing a source of telemetry (the integration is 5 | > prepared). Therefore, the `powertcp` congestion control in the module is only a 6 | > proof of concept. 7 | 8 | > [!NOTE] 9 | > The `rttpowertcp` in the kernel module is functional but—due to limitations 10 | > in the kernel—lacks access to higher-precision hardware timestamps. 
11 | 12 | Following are step-by-step instructions on how to use and experiment with the 13 | PowerTCP kernel module. All commands listed here are assumed to be executed in 14 | the root folder of this repository. 15 | 16 | When loaded into the kernel, the congestion control algorithms are called 17 | `powertcp` and `rttpowertcp`. 18 | 19 | ## Prerequisites 20 | - Any recent Linux kernel and corresponding kernel headers 21 | - `gcc` 22 | - `make` 23 | - `dkms` (optional) 24 | 25 | ## Preparation 26 | 27 | The preparation steps need to be executed on both client and server. 28 | 29 | 1. Install required packages (as root/with `sudo`): 30 | ``` 31 | apt install gcc linux-headers-$(uname -r) make 32 | ``` 33 | 34 | Ideally, tune the network interface *IFACE* for low latency etc. (as root/with 35 | `sudo`): 36 | ``` 37 | apt install ethtool procps tuned 38 | ./tools/tune-eth IFACE 39 | ``` 40 | 2. Build the PowerTCP module implementation: 41 | ``` 42 | make 43 | ``` 44 | 45 | ## On the server 46 | 47 | *Close any previously opened screen sessions that were opened this way.* 48 | 49 | Start `iperf` and `iperf3` server instances, ready to use PowerTCP, in a screen 50 | session (as root/with `sudo`): 51 | ``` 52 | ./tools/setup-module iperf-servers 53 | ``` 54 | **Beware: You are root user inside the screen session!** 55 | 56 | Algorithm parameters (see [On the client](#on-the-client)) do not need to be 57 | set on the server, they are irrelevant here. 58 | 59 | ## On the client 60 | 61 | The `setup-module` script opens a screen session readily prepared to use 62 | PowerTCP. 
63 | 64 | You can and should pass algorithm parameters to `setup-module`, e.g. (as 65 | root/with `sudo`): 66 | ``` 67 | ./tools/setup-module iperf-client host_bw=25000 hop_bw=25000 base_rtt=50 68 | ``` 69 | For a list of the available parameters see 70 | ``` 71 | /sbin/modinfo tcp_powertcp.ko 72 | ``` 73 | Note that a value for the `gamma` parameter must be multiplied by the value 74 | of `gamma_scale` defined in [powertcp_defs.h](../powertcp_defs.h) and rounded 75 | to an integer afterwards. 76 | 77 | **Beware: You are root user inside the screen session!** 78 | 79 | Inside the screen session, you can, e.g., 80 | - run `iperf3` (or `iperf`, the options differ) 81 | ``` 82 | iperf3 -N -C rttpowertcp -c SERVER_IP 83 | ``` 84 | 85 | ## Installation through DKMS 86 | 87 | The kernel module is [prepared](../dkms.conf) for system-wide installation through 88 | DKMS. The [Makefile](../Makefile) provides a convenience target for installation 89 | through DKMS (as root/with `sudo`): 90 | ``` 91 | make dkms_install 92 | ``` 93 | 94 | ## Tracepoints 95 | There are 96 | [tracepoints](https://www.kernel.org/doc/html/latest/trace/tracepoints.html) to 97 | follow the algorithm, mainly for the three core functions defined in the 98 | [paper](#for-powertcp) and the values used and returned by them. The tracepoints 99 | can be found in `/sys/kernel/debug/tracing/events/powertcp`. 100 | 101 | They can be enabled for example (see 102 | [Event Tracing](https://www.kernel.org/doc/html/latest/trace/events.html)) with 103 | (as root/with `sudo`) 104 | ``` 105 | echo 1 > /sys/kernel/debug/tracing/events/powertcp/enable 106 | ``` 107 | and shown with (as root/with `sudo`) 108 | ``` 109 | cat /sys/kernel/debug/tracing/trace_pipe 110 | ``` 111 | or used with any other of the available tools, like 112 | [bpftrace](https://github.com/iovisor/bpftrace).
113 | 114 | ## Development Resources 115 | - [Kernel Build System: Building External Modules](https://www.kernel.org/doc/html/latest/kbuild/modules.html) 116 | -------------------------------------------------------------------------------- /powertcp.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | /* 3 | * PowerTCP congestion control 4 | * 5 | * Based on the algorithm developed in: 6 | * Addanki, V., O. Michel, and S. Schmid. 7 | * "PowerTCP: Pushing the Performance Limits of Datacenter Networks." 8 | * 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 9 | * USENIX Association, 2022. 10 | * Available at: https://arxiv.org/pdf/2112.14309.pdf 11 | * 12 | * Implemented by: 13 | * Jörn-Thorben Hinz, TU Berlin, 2022. 14 | */ 15 | 16 | #define POWERTCP_CONG_OPS_NAME_CONCAT2(prefix, cong_ops_name) \ 17 | prefix##cong_ops_name 18 | #define POWERTCP_CONG_OPS_NAME_CONCAT(prefix, cong_ops_name) \ 19 | POWERTCP_CONG_OPS_NAME_CONCAT2(prefix, cong_ops_name) 20 | #define POWERTCP_CONG_OPS_NAME(cong_ops_name) \ 21 | __stringify(POWERTCP_CONG_OPS_NAME_CONCAT( \ 22 | POWERTCP_CONG_OPS_NAME_PREFIX, cong_ops_name)) 23 | 24 | static void clear_old_cwnds(struct sock *sk) 25 | { 26 | struct powertcp *ca = inet_csk_ca(sk); 27 | ca->old_cwnd.cwnd = 0; 28 | ca->old_cwnd.snd_nxt = 0; 29 | } 30 | 31 | static unsigned long ewma(unsigned long weight, unsigned long weight_scale, 32 | unsigned long value, unsigned long old_value) 33 | { 34 | return (weight * value + (weight_scale - weight) * old_value) / 35 | weight_scale; 36 | } 37 | 38 | /* Return the snd_cwnd that was set when the newly acknowledged segment(s) were 39 | * sent. 
40 | */ 41 | static unsigned long get_cwnd(const struct sock *sk) 42 | { 43 | const struct powertcp *ca = inet_csk_ca(sk); 44 | //const struct tcp_sock *tp = tcp_sk(sk); 45 | //u32 ack_seq = tp->snd_una; 46 | 47 | if (ca->old_cwnd.cwnd != 0 && ca->old_cwnd.snd_nxt != 0 /*&& 48 | before(ca->old_cwnd.snd_nxt, ack_seq)*/) { 49 | return ca->old_cwnd.cwnd; 50 | } 51 | 52 | return ca->snd_cwnd; 53 | } 54 | 55 | /* Return the most recently measured RTT (in us). */ 56 | static unsigned long get_rtt(const struct sock *sk, 57 | const struct rate_sample *rs) 58 | { 59 | const struct tcp_sock *tp = tcp_sk(sk); 60 | long rtt = rs->rtt_us; /* This is -1 if unavailable. */ 61 | if (rtt < 0) { 62 | rtt = tp->srtt_us >> 3; 63 | } 64 | return rtt; 65 | } 66 | 67 | /* Limit a value to positive, non-zero numbers. */ 68 | static unsigned long not_zero(unsigned long val) 69 | { 70 | return max(1UL, val); 71 | } 72 | 73 | static void set_cwnd(struct sock *sk, unsigned long cwnd, 74 | struct powertcp_trace_event *trace_event) 75 | { 76 | struct powertcp *ca = inet_csk_ca(sk); 77 | struct tcp_sock *tp = tcp_sk(sk); 78 | 79 | ca->snd_cwnd = cwnd; 80 | cwnd /= cwnd_scale; 81 | cwnd = min_t(unsigned long, cwnd, tp->snd_cwnd_clamp); 82 | tp->snd_cwnd = not_zero(cwnd); 83 | 84 | if (tracing_enabled() && trace_event) { 85 | trace_event->cwnd = tp->snd_cwnd; 86 | } 87 | } 88 | 89 | /* Look for the base (~= minimum) RTT (in us). */ 90 | static void update_base_rtt(struct sock *sk) 91 | { 92 | struct powertcp *ca = inet_csk_ca(sk); 93 | const struct tcp_sock *tp = tcp_sk(sk); 94 | u32 min_rtt; 95 | 96 | if (base_rtt > -1) { 97 | ca->base_rtt = base_rtt; 98 | return; 99 | } 100 | 101 | min_rtt = tcp_min_rtt(tp); 102 | if (min_rtt != ~0U) { 103 | ca->base_rtt = min_rtt; 104 | return; 105 | } 106 | 107 | min_rtt = tp->srtt_us >> 3; 108 | if (min_rtt) { 109 | ca->base_rtt = min_rtt; 110 | return; 111 | } 112 | 113 | /* bbr_init_pacing_rate_from_rtt() also uses this as fallback. 
*/ 114 | ca->base_rtt = USEC_PER_SEC; 115 | } 116 | 117 | static void update_beta(struct sock *sk, unsigned long old_base_rtt) 118 | { 119 | struct powertcp *ca = inet_csk_ca(sk); 120 | const struct tcp_sock *tp = tcp_sk(sk); 121 | 122 | if (beta < 0 && 123 | (ca->base_rtt < old_base_rtt || old_base_rtt == ULONG_MAX)) { 124 | unsigned long new_beta = 125 | BITS_TO_BYTES(cwnd_scale /* * MEGA */ * ca->host_bw * 126 | ca->base_rtt / expected_flows) / 127 | tp->mss_cache /* / USEC_PER_SEC */; 128 | ca->beta = min(ca->beta, new_beta); 129 | } 130 | } 131 | 132 | static void reset(struct sock *sk, enum tcp_ca_event ev) 133 | { 134 | struct powertcp *ca = inet_csk_ca(sk); 135 | struct tcp_sock *tp = tcp_sk(sk); 136 | 137 | if (ev == CA_EVENT_TX_START || ev == CA_EVENT_CWND_RESTART) { 138 | unsigned long old_base_rtt = ca->base_rtt; 139 | update_base_rtt(sk); 140 | update_beta(sk, old_base_rtt); 141 | } 142 | 143 | /* Only reset those values on a CA_EVENT_CWND_RESTART (used on 144 | * initialization). Otherwise we would reset cwnd and rate too frequently if 145 | * there are frequent CA_EVENT_TX_STARTs. 146 | */ 147 | if (ev == CA_EVENT_CWND_RESTART) { 148 | unsigned long rate = BITS_TO_BYTES(MEGA * ca->host_bw); 149 | unsigned long cwnd = cwnd_scale * rate * ca->base_rtt / 150 | tp->mss_cache / USEC_PER_SEC; 151 | set_rate(sk, rate); 152 | set_cwnd(sk, cwnd, NULL); 153 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 154 | 155 | ca->p_smooth = 0; 156 | 157 | clear_old_cwnds(sk); 158 | } 159 | } 160 | 161 | /* Update the list of recent snd_cwnds. 
*/ 162 | static bool update_old(struct sock *sk, unsigned long p_smooth) 163 | { 164 | struct powertcp *ca = inet_csk_ca(sk); 165 | const struct tcp_sock *tp = tcp_sk(sk); 166 | 167 | if (before(ca->old_cwnd.snd_nxt, tp->snd_una) || 168 | (ca->old_cwnd.cwnd == 0 && ca->old_cwnd.snd_nxt == 0)) { 169 | ca->old_cwnd.cwnd = ca->snd_cwnd; 170 | ca->old_cwnd.snd_nxt = tp->snd_nxt; 171 | } 172 | 173 | ca->p_smooth = p_smooth; 174 | 175 | return true; 176 | } 177 | 178 | static unsigned long update_window(struct sock *sk, unsigned long cwnd_old, 179 | unsigned long norm_power, 180 | struct powertcp_trace_event *trace_event) 181 | { 182 | const struct powertcp *ca = inet_csk_ca(sk); 183 | const struct tcp_sock *tp = tcp_sk(sk); 184 | unsigned long base_bdp = BITS_TO_BYTES(cwnd_scale) * ca->host_bw * 185 | ca->base_rtt / tp->mss_cache; 186 | unsigned long cwnd; 187 | 188 | norm_power = not_zero(norm_power); 189 | cwnd = ewma(gamma, gamma_scale, 190 | power_scale * cwnd_old / norm_power + ca->beta, 191 | ca->snd_cwnd); 192 | cwnd = not_zero(cwnd); 193 | cwnd = min(cwnd, base_bdp); 194 | set_cwnd(sk, cwnd, trace_event); 195 | return cwnd; 196 | } 197 | 198 | static int ptcp_init(struct sock *sk) 199 | { 200 | return int_impl_init(sk); 201 | } 202 | 203 | static unsigned long ptcp_norm_power(struct sock *sk, 204 | const struct rate_sample *rs, 205 | struct powertcp_trace_event *trace_event) 206 | { 207 | const struct powertcp *ca = inet_csk_ca(sk); 208 | unsigned long delta_t = 0; 209 | unsigned long p_norm = 0; 210 | unsigned long p_smooth = ca->p_smooth; 211 | 212 | const struct powertcp_int *prev_int = get_prev_int(sk); 213 | const struct powertcp_int *this_int = get_int(sk, prev_int); 214 | int i; 215 | 216 | /* TODO: Do something helpful (a full reset?) when the path changes. */ 217 | if (!this_int || !prev_int || this_int->path_id != prev_int->path_id) { 218 | /* Power calculations will be skipped for the first one or two ACKs. 219 | * p_smooth will still be 0 then. 
This is intentional to have power 220 | * smoothing start with a proper value (=p_norm) at the end of this 221 | * function. 222 | */ 223 | return 0; 224 | } 225 | 226 | /* for each egress port i on the path */ 227 | for (i = 0; i < this_int->n_hop && i < max_n_hops; ++i) { 228 | const struct powertcp_hop_int *hop_int = &this_int->hops[i]; 229 | const struct powertcp_hop_int *prev_hop_int = 230 | &prev_int->hops[i]; 231 | unsigned long dt = 232 | not_zero((hop_int->ts - prev_hop_int->ts) & max_ts); 233 | long queue_diff = 234 | (long)hop_int->qlen - (long)prev_hop_int->qlen; 235 | u32 tx_bytes_diff = 236 | (hop_int->tx_bytes - prev_hop_int->tx_bytes) & 237 | max_tx_bytes; 238 | /* The variable name "current" instead of lambda would conflict with a 239 | * macro of the same name in asm-generic/current.h. 240 | */ 241 | unsigned long lambda = 242 | not_zero((unsigned long)max( 243 | 0l, queue_diff + (long)tx_bytes_diff) * 244 | (NSEC_PER_SEC / dt)); 245 | unsigned long bdp = hop_int->bandwidth * ca->base_rtt; 246 | unsigned long voltage = hop_int->qlen + bdp; 247 | unsigned long hop_p = lambda * voltage; 248 | unsigned long equilibrium = not_zero( 249 | (unsigned long)hop_int->bandwidth * hop_int->bandwidth / 250 | power_scale * MEGA * ca->base_rtt); 251 | unsigned long hop_p_norm = hop_p / equilibrium; 252 | if (hop_p_norm > p_norm || i == 0) { 253 | p_norm = hop_p_norm; 254 | delta_t = dt; 255 | 256 | if (tracing_enabled() && trace_event) { 257 | trace_event->qlen = hop_int->qlen; 258 | trace_event->tx_bytes_diff = tx_bytes_diff; 259 | } 260 | } 261 | } 262 | 263 | delta_t = min(delta_t, NSEC_PER_USEC * ca->base_rtt); 264 | p_norm = max(p_norm_cutoff, p_norm); 265 | p_smooth = p_smooth == 0 ? 
p_norm : 266 | ewma(delta_t, NSEC_PER_USEC * ca->base_rtt, 267 | p_norm, p_smooth); 268 | 269 | if (tracing_enabled() && trace_event) { 270 | trace_event->delta_t = delta_t; 271 | trace_event->p_norm = p_norm; 272 | trace_event->p_smooth = p_smooth; 273 | } 274 | 275 | return p_smooth; 276 | } 277 | 278 | static void ptcp_release(struct sock *sk) 279 | { 280 | int_impl_release(sk); 281 | } 282 | 283 | static void ptcp_reset(struct sock *sk, enum tcp_ca_event ev) 284 | { 285 | struct ptcp_powertcp *ca = inet_csk_ca(sk); 286 | int_impl_reset(&ca->int_impl, ev); 287 | reset(sk, ev); 288 | } 289 | 290 | static bool ptcp_update_old(struct sock *sk, const struct rate_sample *rs, 291 | unsigned long p_smooth) 292 | { 293 | struct ptcp_powertcp *ca = inet_csk_ca(sk); 294 | int_impl_update_old(&ca->int_impl); 295 | return update_old(sk, p_smooth); 296 | } 297 | 298 | static unsigned long 299 | ptcp_update_window(struct sock *sk, unsigned long cwnd_old, 300 | unsigned long norm_power, 301 | struct powertcp_trace_event *trace_event) 302 | { 303 | return update_window(sk, cwnd_old, norm_power, trace_event); 304 | } 305 | 306 | static int rttptcp_init(struct sock *sk) 307 | { 308 | return 0; 309 | } 310 | 311 | static unsigned long 312 | rttptcp_norm_power(struct sock *sk, const struct rate_sample *rs, 313 | struct powertcp_trace_event *trace_event) 314 | { 315 | struct rttptcp_powertcp *ca = inet_csk_ca(sk); 316 | const struct tcp_sock *tp = tcp_sk(sk); 317 | unsigned long dt, rtt_grad, p_norm, delta_t; 318 | unsigned long p_smooth = ca->p_smooth; 319 | unsigned long rtt_us; 320 | 321 | if (before(tp->snd_una, ca->last_updated)) { 322 | return p_smooth; 323 | } 324 | 325 | ca->t = get_tstamp(sk); 326 | rtt_us = get_rtt(sk, rs); 327 | /* Timestamps are always increasing here, logically. So we want to have 328 | * unsigned wrap-around when it's time and don't use tcp_stamp_us_delta(). 
329 | */ 330 | dt = not_zero(ca->t - ca->t_prev); 331 | delta_t = min(dt, ca->base_rtt * NSEC_PER_USEC); 332 | if (ca->prev_rtt_us <= rtt_us) { 333 | rtt_grad = NSEC_PER_USEC * power_scale * 334 | (rtt_us - ca->prev_rtt_us) / dt; 335 | p_norm = (rtt_grad + power_scale) * rtt_us / ca->base_rtt; 336 | } else { 337 | /* Separate code path for negative rtt_grad since BPF does not support 338 | * division by signed numbers. 339 | */ 340 | rtt_grad = NSEC_PER_USEC * power_scale * 341 | (ca->prev_rtt_us - rtt_us) / dt; 342 | p_norm = (power_scale - min(power_scale, rtt_grad)) * rtt_us / 343 | ca->base_rtt; 344 | } 345 | p_norm = max(p_norm_cutoff, p_norm); 346 | 347 | /* powertcp.p_smooth is initialized with 0, we don't want to smooth for the 348 | * very first calculation. 349 | */ 350 | p_smooth = p_smooth == 0 ? p_norm : 351 | ewma(delta_t, NSEC_PER_USEC * ca->base_rtt, 352 | p_norm, p_smooth); 353 | 354 | if (tracing_enabled() && trace_event) { 355 | trace_event->delta_t = delta_t; 356 | trace_event->p_norm = p_norm; 357 | trace_event->p_smooth = p_smooth; 358 | trace_event->rtt_grad = rtt_grad; 359 | } 360 | 361 | return p_smooth; 362 | } 363 | 364 | static void rttptcp_release(struct sock *sk) 365 | { 366 | /* no-op */ 367 | } 368 | 369 | static void rttptcp_reset(struct sock *sk, enum tcp_ca_event ev) 370 | { 371 | struct rttptcp_powertcp *ca = inet_csk_ca(sk); 372 | const struct tcp_sock *tp = tcp_sk(sk); 373 | 374 | reset(sk, ev); 375 | 376 | /* Only reset those on initialization. 
*/ 377 | if (ev == CA_EVENT_CWND_RESTART) { 378 | // TODO: Evaluate if it actually improves performance of the algorithm 379 | // to reset those two values only on CA_EVENT_CWND_RESTART: 380 | ca->last_updated = tp->snd_nxt; 381 | ca->prev_rtt_us = tp->srtt_us >> 3; 382 | } 383 | 384 | ca->t_prev = ca->t; 385 | } 386 | 387 | static bool rttptcp_update_old(struct sock *sk, const struct rate_sample *rs, 388 | unsigned long p_smooth) 389 | { 390 | struct rttptcp_powertcp *ca = inet_csk_ca(sk); 391 | const struct tcp_sock *tp = tcp_sk(sk); 392 | 393 | if (before(tp->snd_una, ca->last_updated)) { 394 | return false; 395 | } 396 | 397 | update_old(sk, p_smooth); 398 | 399 | ca->last_updated = tp->snd_nxt; 400 | ca->prev_rtt_us = get_rtt(sk, rs); 401 | // TODO: There are multiple timestamps available here. Is there a better one? 402 | ca->t_prev = ca->t; 403 | 404 | return true; 405 | } 406 | 407 | static unsigned long 408 | rttptcp_update_window(struct sock *sk, unsigned long cwnd_old, 409 | unsigned long norm_power, 410 | struct powertcp_trace_event *trace_event) 411 | { 412 | struct rttptcp_powertcp *ca = inet_csk_ca(sk); 413 | const struct tcp_sock *tp = tcp_sk(sk); 414 | 415 | if (before(tp->snd_una, ca->last_updated)) { 416 | return ca->snd_cwnd; 417 | } 418 | 419 | return update_window(sk, cwnd_old, norm_power, trace_event); 420 | } 421 | 422 | #define DEFINE_POWERTCP_VARIANT(func_prefix, cong_ops_name) \ 423 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_cwnd_event, \ 424 | struct sock *sk, enum tcp_ca_event ev) \ 425 | { \ 426 | struct powertcp *ca = inet_csk_ca(sk); \ 427 | \ 428 | if (POWERTCP_UNLIKELY(ca->host_bw == 0)) { \ 429 | return; \ 430 | } \ 431 | \ 432 | if (ev == CA_EVENT_TX_START) { \ 433 | func_prefix##_reset(sk, ev); \ 434 | } \ 435 | } \ 436 | \ 437 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_init, \ 438 | struct sock *sk) \ 439 | { \ 440 | struct powertcp *ca = inet_csk_ca(sk); \ 441 | \ 442 | BUILD_BUG_ON(sizeof(struct 
powertcp) > ICSK_CA_PRIV_SIZE); \ 443 | BUILD_BUG_ON(sizeof(struct func_prefix##_powertcp) > \ 444 | ICSK_CA_PRIV_SIZE); \ 445 | \ 446 | func_prefix##_init(sk); \ 447 | \ 448 | ca->base_rtt = ULONG_MAX; \ 449 | ca->beta = beta < 0 ? ULONG_MAX : beta * cwnd_scale; \ 450 | ca->host_bw = get_host_bw(sk); \ 451 | \ 452 | func_prefix##_reset(sk, CA_EVENT_CWND_RESTART); \ 453 | \ 454 | require_hwtstamps(sk); \ 455 | require_pacing(sk); \ 456 | } \ 457 | \ 458 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_cong_control, \ 459 | struct sock *sk, \ 460 | const struct rate_sample *rs) \ 461 | { \ 462 | struct powertcp *ca = inet_csk_ca(sk); \ 463 | const struct tcp_sock *tp = tcp_sk(sk); \ 464 | unsigned long cwnd_old; \ 465 | unsigned long norm_power; \ 466 | unsigned long cwnd; \ 467 | unsigned long rate; \ 468 | bool updated; \ 469 | struct powertcp_trace_event trace_event = {}; \ 470 | \ 471 | if (POWERTCP_UNLIKELY(ca->host_bw == 0)) { \ 472 | return; \ 473 | } \ 474 | \ 475 | cwnd_old = get_cwnd(sk); \ 476 | norm_power = func_prefix##_norm_power(sk, rs, &trace_event); \ 477 | if (norm_power) { \ 478 | cwnd = func_prefix##_update_window( \ 479 | sk, cwnd_old, norm_power, &trace_event); \ 480 | rate = (USEC_PER_SEC * cwnd * tp->mss_cache) / \ 481 | ca->base_rtt / cwnd_scale; \ 482 | set_rate(sk, rate); \ 483 | } \ 484 | \ 485 | updated = func_prefix##_update_old(sk, rs, norm_power); \ 486 | \ 487 | if (tracing_enabled() && updated && norm_power) { \ 488 | trace_event.rate = rate; \ 489 | trace_event.sock_hash = sk->__sk_common.skc_hash; \ 490 | output_trace_event(&trace_event); \ 491 | } \ 492 | } \ 493 | \ 494 | void POWERTCP_CONG_OPS_FUNC(powertcp_##func_prefix##_release, \ 495 | struct sock *sk) \ 496 | { \ 497 | const struct powertcp *ca = inet_csk_ca(sk); \ 498 | \ 499 | if (POWERTCP_UNLIKELY(ca->host_bw == 0)) { \ 500 | return; \ 501 | } \ 502 | \ 503 | clear_old_cwnds(sk); \ 504 | \ 505 | func_prefix##_release(sk); \ 506 | } \ 507 | \ 508 | 
POWERTCP_CONG_OPS_ATTRS struct tcp_congestion_ops cong_ops_name = { \ 509 | .cong_avoid = POWERTCP_CONG_OPS_FUNC_PTR powertcp_cong_avoid, \ 510 | .cong_control = POWERTCP_CONG_OPS_FUNC_PTR \ 511 | powertcp_##func_prefix##_cong_control, \ 512 | .cwnd_event = POWERTCP_CONG_OPS_FUNC_PTR \ 513 | powertcp_##func_prefix##_cwnd_event, \ 514 | .init = POWERTCP_CONG_OPS_FUNC_PTR \ 515 | powertcp_##func_prefix##_init, \ 516 | .name = POWERTCP_CONG_OPS_NAME(cong_ops_name), \ 517 | .release = POWERTCP_CONG_OPS_FUNC_PTR \ 518 | powertcp_##func_prefix##_release, \ 519 | .ssthresh = POWERTCP_CONG_OPS_FUNC_PTR powertcp_ssthresh, \ 520 | .undo_cwnd = POWERTCP_CONG_OPS_FUNC_PTR powertcp_undo_cwnd, \ 521 | } 522 | 523 | u32 POWERTCP_CONG_OPS_FUNC(powertcp_ssthresh, struct sock *sk) 524 | { 525 | /* We don't do slow starts here! */ 526 | return TCP_INFINITE_SSTHRESH; 527 | } 528 | 529 | u32 POWERTCP_CONG_OPS_FUNC(powertcp_undo_cwnd, struct sock *sk) 530 | { 531 | /* Never undo after a loss. */ 532 | return tcp_sk(sk)->snd_cwnd; 533 | } 534 | 535 | DEFINE_POWERTCP_VARIANT(ptcp, powertcp); 536 | 537 | /* Cannot name it rtt_powertcp due to the size limit for 538 | * tcp_congestion_ops.name. */ 539 | DEFINE_POWERTCP_VARIANT(rttptcp, rttpowertcp); 540 | -------------------------------------------------------------------------------- /powertcp_defs.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ 2 | /* 3 | * Constants and default values common to both PowerTCP implementations. 
4 | */ 5 | #ifndef POWERTCP_DEFS_H 6 | #define POWERTCP_DEFS_H 7 | 8 | static const unsigned long cwnd_scale = (1UL << 10); 9 | static const unsigned long fallback_host_bw = 1000; /* Mbit/s */ 10 | static const unsigned long gamma_scale = (1UL << 10); 11 | static const unsigned long power_scale = (1UL << 16); 12 | static const unsigned long p_norm_cutoff = 0.01 * power_scale; 13 | 14 | /* Avoid an "initializer element is not constant" error with gcc before 8.1 by 15 | * using an enum instead of static const variables. No, I don't want to use 16 | * macros for constants here :-) 17 | */ 18 | enum { 19 | default_base_rtt = -1, /* us */ 20 | default_beta = -1, /* Number of packets */ 21 | default_expected_flows = 10, 22 | default_gamma = 921, /* ~= 0.9 * gamma_scale */ 23 | default_hop_bw = 1000, /* Mbit/s */ 24 | default_host_bw = 1000, /* Mbit/s */ 25 | }; 26 | 27 | #endif /* POWERTCP_DEFS_H */ 28 | -------------------------------------------------------------------------------- /powertcp_head.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | 3 | #ifndef POWERTCP_CONG_OPS_ATTRS 4 | #define POWERTCP_CONG_OPS_ATTRS 5 | #endif 6 | 7 | #ifndef POWERTCP_CONG_OPS_FUNC 8 | #define POWERTCP_CONG_OPS_FUNC(name, args...) name(args) 9 | #endif 10 | 11 | #ifndef POWERTCP_CONG_OPS_FUNC_PTR 12 | #define POWERTCP_CONG_OPS_FUNC_PTR 13 | #endif 14 | 15 | #ifndef POWERTCP_CONG_OPS_NAME_PREFIX 16 | #define POWERTCP_CONG_OPS_NAME_PREFIX 17 | #endif 18 | 19 | #ifndef POWERTCP_LIKELY 20 | #define POWERTCP_LIKELY(cond) cond 21 | #endif 22 | 23 | #ifndef POWERTCP_PARAM_ATTRS 24 | #define POWERTCP_PARAM_ATTRS 25 | #endif 26 | 27 | #ifndef POWERTCP_UNLIKELY 28 | #define POWERTCP_UNLIKELY(cond) cond 29 | #endif 30 | 31 | #ifndef __stringify 32 | #define __stringify_1(x...) #x 33 | #define __stringify(x...) 
__stringify_1(x) 34 | #endif 35 | 36 | struct old_cwnd { 37 | u32 snd_nxt; 38 | unsigned long cwnd; 39 | }; 40 | 41 | #define POWERTCP_STRUCT(struct_name, ...) \ 42 | struct struct_name { \ 43 | unsigned long base_rtt; \ 44 | unsigned long snd_cwnd; \ 45 | \ 46 | unsigned long beta; /* number of packets scaled by cwnd_scale */ \ 47 | \ 48 | struct old_cwnd old_cwnd; \ 49 | \ 50 | unsigned long p_smooth; \ 51 | \ 52 | /* powertcp_cong_control() seems to (unexpectedly) get called once before \ 53 | * powertcp_init(). host_bw is still 0 then, thanks to \ 54 | * tcp_assign_congestion_control(), and we use that as an indicator whether \ 55 | * we are initialized. \ 56 | */ \ 57 | unsigned long host_bw; /* Mbit/s */ \ 58 | \ 59 | __VA_ARGS__ \ 60 | } 61 | #define POWERTCP_STRUCT_FIELDS(fields) fields 62 | 63 | // clang-format off 64 | POWERTCP_STRUCT(powertcp); 65 | 66 | POWERTCP_STRUCT(ptcp_powertcp, 67 | POWERTCP_STRUCT_FIELDS( 68 | powertcp_int_impl_t int_impl; 69 | ) 70 | ); 71 | 72 | POWERTCP_STRUCT(rttptcp_powertcp, 73 | POWERTCP_STRUCT_FIELDS( 74 | u32 last_updated; 75 | unsigned long prev_rtt_us; 76 | u64 t; /* in ns */ 77 | u64 t_prev; /* in ns */ 78 | ) 79 | ); 80 | // clang-format on 81 | 82 | #undef POWERTCP_STRUCT 83 | #undef POWERTCP_STRUCT_FIELDS 84 | /* Algorithm parameters. Each default is an enumerator (not a static const variable) so that every initializer is an integer constant expression. */ 85 | POWERTCP_PARAM_ATTRS long base_rtt = default_base_rtt; 86 | POWERTCP_PARAM_ATTRS long beta = default_beta; /* Number of packets */ 87 | POWERTCP_PARAM_ATTRS long expected_flows = default_expected_flows; 88 | POWERTCP_PARAM_ATTRS long gamma = default_gamma; 89 | POWERTCP_PARAM_ATTRS long hop_bw = default_hop_bw; /* Mbit/s */ 90 | POWERTCP_PARAM_ATTRS long host_bw = default_host_bw; /* Mbit/s */ 91 | -------------------------------------------------------------------------------- /powertcp_int.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | 3 | struct powertcp_hop_int { 4 | u32 bandwidth; /* in MByte/s */ 5 | u32 ts; /*
careful: in ns */ 6 | u32 tx_bytes; 7 | u32 qlen; 8 | }; 9 | 10 | struct powertcp_int { 11 | int n_hop; 12 | int path_id; 13 | struct powertcp_hop_int hops[max_n_hops]; 14 | }; 15 | -------------------------------------------------------------------------------- /powertcp_no-int.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | 3 | static const struct powertcp_int *get_int(struct sock *sk, 4 | const struct powertcp_int *prev_int) 5 | { 6 | return NULL; 7 | } 8 | 9 | static const struct powertcp_int *get_prev_int(struct sock *sk) 10 | { 11 | return NULL; 12 | } 13 | 14 | static int int_impl_init(struct sock *sk) 15 | { 16 | return 0; 17 | } 18 | 19 | static void int_impl_release(struct sock *sk) 20 | { 21 | } 22 | 23 | static void int_impl_reset(powertcp_int_impl_t *int_impl, enum tcp_ca_event ev) 24 | { 25 | } 26 | 27 | static void int_impl_update_old(powertcp_int_impl_t *int_impl) 28 | { 29 | } 30 | 31 | static int register_int(struct tcp_congestion_ops *cong_ops) 32 | { 33 | return 0; 34 | } 35 | 36 | static void unregister_int(struct tcp_congestion_ops *cong_ops) 37 | { 38 | } 39 | -------------------------------------------------------------------------------- /powertcp_no-int_head.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | 3 | enum { max_n_hops = 1 }; 4 | 5 | #include "powertcp_int.c" 6 | 7 | /* In case the ts value is taken directly from a less-than-32-bit INT field, 8 | * its maximum value has to be known for correct wrap-around in calculations. 9 | */ 10 | static const unsigned int max_ts = -1; 11 | 12 | /* In case the tx_bytes value is taken directly from a less-than-32-bit INT 13 | * field, its maximum value has to be known for correct wrap-around in 14 | * calculations. 
15 | */ 16 | static const u32 max_tx_bytes = -1; 17 | 18 | struct powertcp_int_impl { 19 | }; 20 | 21 | typedef struct powertcp_int_impl *powertcp_int_impl_t; 22 | -------------------------------------------------------------------------------- /powertcp_trace.h: -------------------------------------------------------------------------------- 1 | #ifndef POWERTCP_TRACE_H 2 | #define POWERTCP_TRACE_H 3 | 4 | /* This header requires prior inclusion of vmlinux.h or linux/types.h. */ 5 | 6 | struct powertcp_trace_event { 7 | __u64 time; 8 | unsigned int sock_hash; 9 | __u32 cwnd; 10 | unsigned long rate; 11 | unsigned long p_norm; 12 | unsigned long p_smooth; 13 | unsigned long qlen; 14 | __u32 tx_bytes_diff; 15 | __u32 delta_t; /* careful: in ns */ 16 | long rtt_grad; // long instead of unsigned long might truncate a huge rtt_grad 17 | }; 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /tcp_powertcp.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | /* 3 | * PowerTCP congestion control 4 | * 5 | * Based on the algorithm developed in: 6 | * Addanki, V., O. Michel, and S. Schmid. 7 | * "PowerTCP: Pushing the Performance Limits of Datacenter Networks." 8 | * 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 9 | * USENIX Association, 2022. 10 | * Available at: https://arxiv.org/pdf/2112.14309.pdf 11 | * 12 | * Implemented by: 13 | * Jörn-Thorben Hinz, TU Berlin, 2022. 
14 | */ 15 | 16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 | 18 | #include "powertcp_defs.h" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "powertcp_trace.h" 28 | 29 | #ifndef MEGA 30 | #define MEGA 1000000UL 31 | #endif 32 | 33 | #define CREATE_TRACE_POINTS 34 | #include "tcp_powertcp_trace.h" 35 | 36 | #ifndef BITS_TO_BYTES 37 | #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) 38 | #endif 39 | 40 | #define POWERTCP_CONG_OPS_ATTRS static __read_mostly 41 | #define POWERTCP_CONG_OPS_FUNC_ATTRS static 42 | #define POWERTCP_PARAM_ATTRS static __read_mostly 43 | #define POWERTCP_UNLIKELY unlikely 44 | 45 | #include "powertcp_no-int_head.c" 46 | 47 | #include "powertcp_head.c" 48 | 49 | module_param(base_rtt, long, 0444); 50 | MODULE_PARM_DESC( 51 | base_rtt, 52 | "base (minimum) round-trip time (RTT) in us (default: -1; -1: automatically detect)"); 53 | module_param(beta, long, 0444); 54 | MODULE_PARM_DESC(beta, 55 | "additive increase (default: -1; -1: automatically set beta)"); 56 | module_param(expected_flows, long, 0444); 57 | MODULE_PARM_DESC(expected_flows, 58 | "expected number of flows sharing the host NIC (default: 10)"); 59 | module_param(gamma, long, 0444); 60 | MODULE_PARM_DESC(gamma, "exponential moving average weight, times " __stringify( 61 | gamma_scale) "(default: 921 ~= 0,9)"); 62 | module_param(hop_bw, long, 0444); 63 | MODULE_PARM_DESC(hop_bw, "hop bandwidth in Mbit/s"); 64 | module_param(host_bw, long, 0444); 65 | MODULE_PARM_DESC( 66 | host_bw, 67 | "host NIC bandwidth in Mbit/s (default: -1; -1: detect from socket)"); 68 | 69 | /* Look for the host bandwidth (in Mbit/s). 
*/ 70 | static unsigned long get_host_bw(struct sock *sk) 71 | { 72 | const struct dst_entry *dst; 73 | unsigned long bw = fallback_host_bw; 74 | 75 | if (host_bw > 0) { 76 | return host_bw; 77 | } 78 | 79 | dst = __sk_dst_get(sk); 80 | if (dst && dst->dev) { 81 | struct ethtool_link_ksettings cmd; 82 | int r; 83 | 84 | rtnl_lock(); 85 | /* ethtool_params_from_link_mode() would be even simpler. 86 | * But dst->dev->link_mode seems to always be 0 at this point. */ 87 | r = __ethtool_get_link_ksettings(dst->dev, &cmd); 88 | rtnl_unlock(); 89 | if (r == 0 && cmd.base.speed != SPEED_UNKNOWN) { 90 | bw = cmd.base.speed; 91 | pr_debug("hash=%u: got link speed: %lu Mbit/s\n", 92 | sk->sk_hash, bw); 93 | } else { 94 | pr_warn("link speed unavailable, using fallback: %lu Mbit/s\n", 95 | bw); 96 | } 97 | } 98 | 99 | return bw; 100 | } 101 | 102 | static u64 get_tstamp(const struct sock *sk) 103 | { 104 | return tcp_sk(sk)->tcp_clock_cache; 105 | } 106 | 107 | static void output_trace_event(struct powertcp_trace_event *trace_event) 108 | { 109 | trace_event->time = ktime_get_ns(); 110 | trace_cong_control(trace_event); 111 | } 112 | 113 | void require_hwtstamps(struct sock *sk) 114 | { 115 | /* TODO: Would it make sense to execute (the equivalent of) 116 | * ioctl(SIOCSHWTSTAMP) for the/a network device here? 117 | */ 118 | 119 | int optval = SOF_TIMESTAMPING_RX_HARDWARE; 120 | tcp_setsockopt(sk, SOL_SOCKET, SO_TIMESTAMPING_NEW, 121 | KERNEL_SOCKPTR(&optval), sizeof(optval)); 122 | } 123 | 124 | static void require_pacing(struct sock *sk) 125 | { 126 | cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); 127 | } 128 | 129 | /* Set the socket pacing rate (bytes per second). */ 130 | static void set_rate(struct sock *sk, unsigned long rate) 131 | { 132 | /* Before 4.20, sk_max_pacing_rate was only a u32. Use explicit min_t with 133 | * type here to avoid a warning on those older kernels. 
134 | */ 135 | sk->sk_pacing_rate = min_t(unsigned long, rate, sk->sk_max_pacing_rate); 136 | } 137 | 138 | static bool tracing_enabled(void) 139 | { 140 | return trace_cong_control_enabled(); 141 | } 142 | 143 | /* cong_avoid was previously non-optional in tcp_congestion_ops for a BPF CA. 144 | * For the module implementation it can just be set to a NULL pointer. 145 | */ 146 | static const void *const powertcp_cong_avoid = NULL; 147 | 148 | #include "powertcp_no-int.c" 149 | 150 | #include "powertcp.c" 151 | 152 | static int __init powertcp_register(void) 153 | { 154 | int ret; 155 | 156 | powertcp.owner = THIS_MODULE; 157 | ret = tcp_register_congestion_control(&powertcp); 158 | if (ret) { 159 | return ret; 160 | } 161 | 162 | ret = register_int(&powertcp); 163 | if (ret) { 164 | tcp_unregister_congestion_control(&powertcp); 165 | return ret; 166 | } 167 | 168 | rttpowertcp.owner = THIS_MODULE; 169 | ret = tcp_register_congestion_control(&rttpowertcp); 170 | if (ret) { 171 | return ret; 172 | } 173 | 174 | return 0; 175 | } 176 | 177 | static void __exit powertcp_unregister(void) 178 | { 179 | unregister_int(&powertcp); 180 | tcp_unregister_congestion_control(&powertcp); 181 | tcp_unregister_congestion_control(&rttpowertcp); 182 | } 183 | 184 | module_init(powertcp_register); 185 | module_exit(powertcp_unregister); 186 | 187 | MODULE_ALIAS("tcp_rttpowertcp"); 188 | MODULE_AUTHOR("Jörn-Thorben Hinz"); 189 | MODULE_DESCRIPTION("PowerTCP congestion control"); 190 | MODULE_LICENSE("Dual MIT/GPL"); 191 | -------------------------------------------------------------------------------- /tcp_powertcp_trace.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 OR MIT 2 | #undef TRACE_SYSTEM 3 | #define TRACE_SYSTEM powertcp 4 | 5 | #if !defined(_TRACE_POWERTCP_H) || defined(TRACE_HEADER_MULTI_READ) 6 | #define _TRACE_POWERTCP_H 7 | 8 | #include 9 | 10 | // clang-format off 11 | 
TRACE_EVENT(cong_control, 12 | TP_PROTO(const struct powertcp_trace_event *ev), 13 | TP_ARGS(ev), 14 | TP_STRUCT__entry( 15 | __field(u64, time) 16 | __field(unsigned int, sock_hash) 17 | __field(u32, cwnd) 18 | __field(unsigned long, rate) 19 | __field(unsigned long, p_norm) 20 | __field(unsigned long, p_smooth) 21 | __field(unsigned long, qlen) 22 | __field(__u32, tx_bytes_diff) 23 | __field(__u32, delta_t) 24 | ), 25 | TP_fast_assign( 26 | __entry->time = ev->time; 27 | __entry->sock_hash = ev->sock_hash; 28 | __entry->cwnd = ev->cwnd; 29 | __entry->rate = ev->rate; 30 | __entry->p_norm = ev->p_norm; 31 | __entry->p_smooth = ev->p_smooth; 32 | __entry->qlen = ev->qlen; 33 | __entry->tx_bytes_diff = ev->tx_bytes_diff; 34 | __entry->delta_t = ev->delta_t; 35 | ), 36 | TP_printk("time=%llu us sock_hash=%u cwnd=%u rate=%ld Mbit/s p_norm=%ld p_smooth=%ld qlen=%ld tx_bytes_diff=%u bytes delta_t=%u ns", 37 | __entry->time, 38 | __entry->sock_hash, 39 | __entry->cwnd, 40 | BITS_PER_BYTE * __entry->rate / MEGA, 41 | __entry->p_norm, 42 | __entry->p_smooth, 43 | __entry->qlen, 44 | __entry->tx_bytes_diff, 45 | __entry->delta_t 46 | ) 47 | ); 48 | // clang-format on 49 | 50 | #endif /* _TRACE_POWERTCP_H */ 51 | 52 | #undef TRACE_INCLUDE_FILE 53 | #define TRACE_INCLUDE_FILE tcp_powertcp_trace 54 | #undef TRACE_INCLUDE_PATH 55 | #define TRACE_INCLUDE_PATH . 56 | 57 | /* This part must be outside protection */ 58 | #include 59 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # `setup-bpf` and `setup-module` 2 | The setup scripts prepare the BPF and module implementation of PowerTCP, 3 | respectively, for usage. This includes (re)loading the implementation and its 4 | prerequisites and opening a screen session for interactive usage, e.g. for 5 | calling `iperf`/`iperf3`. 
6 | 7 | **The scripts must be called as root or with `sudo`.** 8 | 9 | ## Usage 10 | ``` 11 | setup-bpf SESSION_NAME [PARAMETER...] 12 | setup-module SESSION_NAME [PARAMETER...] 13 | ``` 14 | 15 | ### `SESSION_NAME` 16 | Required. Name of a predefined screen session to open. Available sessions: 17 | - `iperf-client`: Opens an empty screen session for `iperf`/`iperf3` *client* 18 | usage. 19 | - `iperf-servers`: Opens a screen session with both `iperf` and `iperf3` 20 | servers readily running inside. 21 | 22 | **Using PowerTCP and its prerequisites outside of the opened screen session 23 | will require additional, manual setup steps (e.g., joining the TCP-INT cgroup 24 | for the BPF implementation).** 25 | 26 | ### `PARAMETER` 27 | Optional. One or multiple PowerTCP algorithm parameters. Available parameters: 28 | - `base_rtt`: Base RTT in µs 29 | - `beta`: Additive increase parameter in number of packets 30 | - `hop_bw`: Link speed of the switches in Mbit/s 31 | - `host_bw`: Link speed of the host in Mbit/s 32 | - `expected_flows`: Expected number of flows on a link 33 | - `gamma`: EWMA weight in range [0.0, 1.0] 34 | 35 | Currently, parameter values passed to `setup-module` need to be scaled with the 36 | constants defined in [powertcp_defs.h](../powertcp_defs.h). `setup-bpf` accepts 37 | values in the units specified above. 
38 | 39 | Parameters can be set to different values from within the screen session 40 | without calling `setup-bpf` 41 | ```console 42 | root@host:powertcp-linux# ./bpf/powertcp -f register base_rtt=100 hop_bw=25000 host_bw=25000 43 | ``` 44 | or `setup-module` again 45 | ```console 46 | root@host:powertcp-linux# ./tools/reinsmod base_rtt=100 hop_bw=25000 host_bw=25000 47 | ``` 48 | 49 | ## Examples 50 | ```console 51 | user@host:powertcp-linux$ ./tools/setup-bpf iperf-client base_rtt=123 hop_bw=100000 host_bw=100000 52 | user@host:powertcp-linux$ ./tools/setup-bpf iperf-servers 53 | user@host:powertcp-linux$ ./tools/setup-module iperf-client base_rtt=456 host_bw=10000 54 | user@host:powertcp-linux$ ./tools/setup-module iperf-servers 55 | ``` 56 | -------------------------------------------------------------------------------- /tools/bpf_tracer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # bpf_tracer IPERF(3)_CMDLINE -- POWERTCP_PARAMS 5 | # 6 | # Example calls: 7 | # bpf_tracer iperf -N -c 192.168.13.3 -Z bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="2 10" base_rtt=50 8 | # bpf_tracer iperf3 -NZ -c 192.168.13.3 -C bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="2 10" base_rtt=50 9 | # bpf_tracer iperf3 -NZ -c 192.168.13.3 -C bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="1 2 4 8 10" base_rtt="50 500 5000" 10 | # bpf_tracer iperf3 -NZ -c 192.168.13.3 -C bpf_powertcp -- host_bw=12000 hop_bw=25000 beta="1 2 4 8 10" base_rtt=50 gamma="0.5 0.7 0.9" 11 | # 12 | 13 | set -eu 14 | 15 | iterate_param() 16 | { 17 | local inargs=$1 18 | local param=$2 19 | shift 2 20 | local vals 21 | read -r -a vals <<<"${powertcp_params[$param]}" 22 | 23 | local args 24 | for val in "${vals[@]}"; do 25 | args="${inargs:+$inargs }$param=$val" 26 | run "$args" "$@" 27 | done 28 | } 29 | 30 | run() 31 | { 32 | if [[ $# -gt 1 ]]; then 33 | iterate_param "$@" 34 | return 35 | fi 36 | 37 | local args=$1 38 | 39 | 
local csv_file 40 | printf -v csv_file "bpf_powertcp-%s.csv" "$args" 41 | 42 | [[ -z $args ]] || printf "# %s\n" "$args" 43 | 44 | # shellcheck disable=SC2086 45 | "${repo_dir}/bpf/powertcp" register -f tracing $args 46 | 47 | "${repo_dir}/bpf/powertcp" trace -C > "$csv_file" & 48 | local trace_pid=$! 49 | "${iperf_cmdline[@]}" 50 | 51 | sleep 3 52 | kill "$trace_pid" 53 | wait 54 | 55 | printf "\n" 56 | } 57 | 58 | repo_dir=${0%/*}/.. 59 | if [[ ! -d ${repo_dir}/tools ]]; then 60 | printf "I don’t know where I’m called from\n" >&2 61 | exit 2 62 | fi 63 | 64 | iperf_cmdline=() 65 | while [[ $# -gt 0 && $1 != -- ]]; do 66 | iperf_cmdline+=( "$1" ) 67 | shift 68 | done 69 | 70 | # Skip -- 71 | [[ $# -eq 0 ]] || shift 72 | 73 | declare -A powertcp_params 74 | for arg in "$@"; do 75 | powertcp_params+=( ["${arg%=*}"]="${arg#*=}" ) 76 | done 77 | 78 | "${repo_dir}/bpf/powertcp" unregister || : 79 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload || : 80 | 81 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load 82 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "${TCP_INT_ENABLE:-enable}" 83 | 84 | printf "%d" "$$" > /sys/fs/cgroup/cgroup.tcp-int/cgroup.procs 85 | 86 | run "" "${!powertcp_params[@]}" 87 | -------------------------------------------------------------------------------- /tools/gro_experiment: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | client_runs() 5 | { 6 | local run_str 7 | printf -v run_str '\e[u\e[Krun %%i/%i' "$runs" 8 | 9 | progress '\e[s' 10 | for ((i = 0; i < runs; ++i)); do 11 | progress "$run_str" "$i" 12 | avg_throughput=$( 13 | iperf3 --client="$srv_ip" \ 14 | --congestion="$cca" \ 15 | --interval=0 \ 16 | --json \ 17 | --no-delay \ 18 | --omit=1 | 19 | jq .end.sum_received.bits_per_second 20 | ) 21 | printf '%s,%s,%s,%s\n' "$cca" "$gro" "$int" "${avg_throughput%%.*}" 22 | sleep 1 # Give the server a moment to be ready again 23 | done 24 | progress 
"${run_str}\n" "$runs" 25 | } 26 | 27 | die() 28 | { 29 | local r=$1 30 | shift 31 | # shellcheck disable=SC2059 32 | printf "$@" >&2 33 | exit "$r" 34 | } 35 | 36 | progress() 37 | { 38 | # shellcheck disable=SC2059 39 | [[ -t 1 ]] || printf "$@" >&2 40 | } 41 | 42 | server_runs() 43 | { 44 | for ((i = 0; i < runs; ++i)); do 45 | iperf3 --one-off --server 46 | done 47 | } 48 | 49 | readonly role=$1 50 | readonly iface=$2 51 | 52 | if [[ $role == client ]]; then 53 | readonly srv_ip=$3 54 | readonly runs=${4:-10} 55 | readonly ccas=${5:-cubic} 56 | 57 | netcat -q0 "$srv_ip" 5201 <<-EOF || die 1 "Cannot reach server, experiment must be started there first!\n" 58 | ccas="$ccas" 59 | runs="$runs" 60 | EOF 61 | else 62 | # Executing arbitrary remote shell code, what could go wrong O:-) 63 | eval "$(netcat -q0 -l 5201)" 64 | fi 65 | 66 | readonly repo_dir=${0%/*}/.. 67 | if [[ ! -d ${repo_dir}/tools ]]; then 68 | printf "I don’t know where I’m called from\n" >&2 69 | exit 2 70 | fi 71 | 72 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload &>/dev/null || : 73 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load 74 | printf "%d" "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs 75 | 76 | printf 'cca,gro,int,avg_throughput\n' 77 | for cca in $ccas; do 78 | for gro in off on; do 79 | ethtool -K "$iface" gro "$gro" 80 | 81 | for int in disable enable; do 82 | progress 'cca=%s gro=%s int=%s:\t' "$cca" "$gro" "$int" 83 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "$int" 84 | "${role}_runs" 85 | done 86 | done 87 | done 88 | -------------------------------------------------------------------------------- /tools/gro_plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | 5 | import matplotlib as mpl 6 | import matplotlib.pyplot as plt 7 | import matplotlib.ticker as ticker 8 | import pandas as pd 9 | 10 | FIGSIZE = (2.5, 2) 11 | 12 | REPLACEMENTS = { 13 | "bbr": "BBR", 14 | 
"cubic": "Cubic", 15 | "dctcp": "DCTCP", 16 | "disable": "disabled", 17 | "enable": "enabled", 18 | "gro": "GRO", 19 | "int": "INT", 20 | "reno": "Reno", 21 | } 22 | 23 | 24 | def main(): 25 | plt.rcParams.update({"pdf.fonttype": 42}) 26 | 27 | try: 28 | mpl.style.use("seaborn-v0_8-colorblind") 29 | except Exception as e: 30 | print("Failed to change matplotlib style: {}".format(e)) 31 | 32 | argparser = argparse.ArgumentParser() 33 | argparser.add_argument( 34 | "--link-speed", 35 | default=25, 36 | type=int, 37 | help="speed of the measured link in Gbit/s", 38 | ) 39 | argparser.add_argument("csv_file", type=argparse.FileType("r")) 40 | args = argparser.parse_args() 41 | 42 | df = pd.read_csv(args.csv_file).rename(columns=REPLACEMENTS).replace(REPLACEMENTS) 43 | args.csv_file.close() # Be nice and close files :-) 44 | 45 | df["avg_throughput"] /= 10**9 46 | 47 | means = pd.pivot_table(df, columns=["GRO", "INT"], index="cca") 48 | 49 | fig, ax = plt.subplots(figsize=FIGSIZE, layout="constrained") 50 | ax = means["avg_throughput"].plot( 51 | ax=ax, 52 | kind="bar", 53 | rot=0, 54 | xlabel="Congestion control algorithm", 55 | ylabel="Throughput (Gbps)", 56 | ) 57 | ax.grid(linestyle="--") 58 | ax.set_ylim(0, args.link_speed) 59 | ax.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100])) 60 | fig.savefig("cca-gro-int-avg_throughput.pdf") 61 | plt.show() 62 | 63 | 64 | if __name__ == "__main__": 65 | raise SystemExit(main()) 66 | -------------------------------------------------------------------------------- /tools/iperf_csv: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | printf 'datetime,srcip,srcport,dstip,dstport,thread,interval,cwnd,rate\n' 4 | iperf -yc -i1 "$@" 5 | -------------------------------------------------------------------------------- /tools/iratio_experiment: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Example 
usage: 4 | # ./tools/iratio_experiment client eno1 192.168.13.3 user@tofino /path/to/private/ssh/key bf-sde-9.7.2/ '1 2 4 8 16' 5 | # ./tools/iratio_experiment client -C reno -r 1 -s 100 -t 60 eno1 192.168.13.3 user@tofino /path/to/private/ssh/key bf-sde-9.7.2/ '1 2 4 8 16' 6 | # 7 | # ./tools/iratio_experiment server eno1 8 | # 9 | 10 | set -eu 11 | 12 | client_runs() 13 | { 14 | local run_str 15 | printf -v run_str '\e[u\e[Krun %%i/%i' "$runs" 16 | 17 | progress '\e[s' 18 | for ((i = 0; i < runs; ++i)); do 19 | progress "$run_str" "$i" 20 | avg_throughput=$( 21 | iperf3 --client="$srv_ip" \ 22 | --congestion="$cca" \ 23 | --interval=0 \ 24 | --json \ 25 | --no-delay \ 26 | --omit=1 \ 27 | --time="$run_duration" | 28 | jq .end.sum_received.bits_per_second 29 | ) 30 | printf '%s,%s,%s,%d,%s\n' "$cca" "$int" "$gro" "$iratio" "${avg_throughput%%.*}" 31 | sleep 1 # Give the server a moment to be ready again 32 | done 33 | progress "${run_str}\n" "$runs" 34 | } 35 | 36 | die() 37 | { 38 | local r=$1 39 | shift 40 | # shellcheck disable=SC2059 41 | printf "$@" >&2 42 | exit "$r" 43 | } 44 | 45 | progress() 46 | { 47 | if [[ $role == server || ! -t 1 ]]; then 48 | # shellcheck disable=SC2059 49 | printf "$@" >&2 50 | fi 51 | } 52 | 53 | server_runs() 54 | { 55 | printf '\n' 56 | for ((i = 0; i < runs; ++i)); do 57 | iperf3 --one-off --server 58 | done 59 | } 60 | 61 | readonly role=$1 62 | shift 63 | 64 | if [[ $role == client ]]; then 65 | cca=cubic 66 | link_speed=25 67 | mtu=1500 68 | qdepth_threshold=1500 69 | run_duration=10 70 | runs=10 71 | # Every option letter handled in the case below must also be listed in the getopts optstring. 72 | while getopts 'C:M:q:r:s:t:' opt; do 73 | case $opt in 74 | C) cca=$OPTARG ;; 75 | M) mtu=$OPTARG ;; 76 | q) qdepth_threshold=$OPTARG ;; 77 | r) runs=$OPTARG ;; 78 | s) link_speed=$OPTARG ;; 79 | t) run_duration=$OPTARG ;; 80 | ?) 
exit 2 ;; 81 | esac 82 | done 83 | 84 | readonly cca 85 | readonly link_speed 86 | readonly qdepth_threshold 87 | readonly run_duration 88 | readonly runs 89 | 90 | shift $((OPTIND - 1)) 91 | else 92 | while getopts '' opt; do 93 | case $opt in 94 | ?) exit 2 ;; 95 | esac 96 | done 97 | fi 98 | 99 | readonly iface=$1 100 | shift 101 | 102 | if [[ $role == client ]]; then 103 | readonly srv_ip=$1 104 | readonly switch_user_host=$2 105 | readonly private_key_file=$3 106 | readonly switch_sde_dir=$4 107 | readonly iratios=$5 108 | 109 | shift 5 110 | fi 111 | 112 | if [[ $# -gt 0 ]]; then 113 | die 2 'unexpected arguments -- %s\n' "$*" 114 | fi 115 | 116 | if [[ $role == client ]]; then 117 | progress 'Sending experiment parameters to server …\n' 118 | netcat -q0 "$srv_ip" 5201 <<-EOF || die 1 "Cannot reach server, experiment must be started there first!\n" 119 | cca="$cca" 120 | iratios="$iratios" 121 | mtu="$mtu" 122 | runs="$runs" 123 | EOF 124 | else 125 | progress 'Waiting for client to send experiment parameters …\n' 126 | # Executing arbitrary remote shell code, what could go wrong O:-) 127 | eval "$(netcat -q0 -l 5201)" 128 | fi 129 | 130 | readonly repo_dir=${0%/*}/.. 131 | if [[ ! 
-d ${repo_dir}/tools ]]; then 132 | printf "I don’t know where I’m called from\n" >&2 133 | exit 2 134 | fi 135 | 136 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload &>/dev/null || : 137 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load 138 | printf "%d" "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs 139 | 140 | [[ $role == server ]] || printf 'cca,int,gro,iratio,avg_throughput\n' 141 | 142 | ip link set dev "$iface" mtu "$mtu" 143 | 144 | int=disable 145 | iratio=-1 146 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "$int" 147 | for gro in off on; do 148 | ethtool -K "$iface" gro "$gro" 149 | 150 | progress 'cca=%s gro=%s int=%s iratio=%d:\t' "$cca" "$gro" "$int" "$iratio" 151 | "${role}_runs" 152 | done 153 | 154 | int=enable 155 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" "$int" 156 | for iratio in $iratios; do 157 | if [[ $role == client ]]; then 158 | # shellcheck disable=SC2087 159 | ssh -i "$private_key_file" "$switch_user_host" bash -s <<-EOF 160 | set -e 161 | cd "$switch_sde_dir" 162 | . 
set_sde.bash >/dev/null 163 | ./pkgsrc/switch-p4-16/scripts/tcp_int_cp.py deploy --link "$link_speed" --iratio "$iratio" --qdepth_th "$qdepth_threshold" &>/dev/null 164 | EOF 165 | fi 166 | 167 | for gro in off on; do 168 | ethtool -K "$iface" gro "$gro" 169 | 170 | progress 'cca=%s gro=%s int=%s iratio=%d:\t' "$cca" "$gro" "$int" "$iratio" 171 | "${role}_runs" 172 | done 173 | done 174 | -------------------------------------------------------------------------------- /tools/iratio_plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | 5 | import matplotlib as mpl 6 | import matplotlib.legend_handler 7 | import matplotlib.pyplot as plt 8 | import matplotlib.ticker as ticker 9 | import pandas as pd 10 | 11 | FIGSIZE = (2.5, 2) 12 | 13 | LINESTYLES = ["--", "-"] 14 | MARKERS = ["x", "s"] 15 | 16 | REPLACEMENTS = { 17 | "gro": "GRO", 18 | "int": "INT", 19 | } 20 | 21 | 22 | def main(): 23 | plt.rcParams.update({"pdf.fonttype": 42}) 24 | 25 | try: 26 | mpl.style.use("seaborn-v0_8-colorblind") 27 | except Exception as e: 28 | print("Failed to change matplotlib style: {}".format(e)) 29 | 30 | argparser = argparse.ArgumentParser() 31 | argparser.add_argument("csv_file", type=argparse.FileType("r")) 32 | argparser.add_argument( 33 | "--max-iratio", type=int, help="maximum iratio value to plot" 34 | ) 35 | args = argparser.parse_args() 36 | 37 | df = pd.read_csv(args.csv_file).rename(columns=REPLACEMENTS).replace(REPLACEMENTS) 38 | args.csv_file.close() # Be nice and close files :-) 39 | 40 | if df["cca"].nunique() > 1: 41 | return "cannot plot for multiple CCAs" 42 | cca = df.loc[0, "cca"] 43 | 44 | max_iratio = max(df["iratio"]) 45 | try: 46 | max_iratio = min(args.max_iratio, max_iratio) 47 | except TypeError: 48 | pass 49 | 50 | df["avg_throughput"] /= 10**9 51 | df = df.loc[(df["INT"] == "enable") & (df["iratio"] <= max_iratio)] 52 | 53 | fig, ax = plt.subplots(figsize=FIGSIZE, 
layout="constrained") 54 | for linestyle, marker, gro_group in zip(LINESTYLES, MARKERS, df.groupby("GRO")): 55 | gro, gro_df = gro_group 56 | gro_df = gro_df.groupby("iratio").mean(numeric_only=True) 57 | ax.semilogx( 58 | "avg_throughput", 59 | data=gro_df, 60 | label=f"GRO {gro}", 61 | linestyle=linestyle, 62 | marker=marker, 63 | ) 64 | 65 | ax.grid(linestyle="--") 66 | ax.legend( 67 | bbox_to_anchor=(-0.15, 1.1, 1.15, 0), 68 | borderaxespad=0, 69 | loc="lower left", 70 | mode="expand", 71 | ncols=df["GRO"].nunique(), 72 | ) 73 | ax.set_xlabel("tagratio") 74 | ax.set_ylabel("Throughput (Gbps)") 75 | 76 | mid_iratio = df["iratio"].unique() 77 | mid_iratio = mid_iratio[len(mid_iratio) // 2] 78 | ax.set_xticks([1, 4, mid_iratio, max_iratio], [1, 4, mid_iratio, "≈ no INT"]) 79 | 80 | ax.xaxis.set_minor_locator(ticker.NullLocator()) 81 | ax.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100])) 82 | 83 | fig.savefig(f"{cca}-iratio-gro-avg_throughput.pdf") 84 | 85 | plt.show() 86 | 87 | 88 | if __name__ == "__main__": 89 | raise SystemExit(main()) 90 | -------------------------------------------------------------------------------- /tools/mtu_experiment: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | client_runs() 6 | { 7 | for ((i = 0; i < runs; ++i)); do 8 | avg_throughput=$( 9 | iperf3 --client="$srv_ip" \ 10 | --congestion="$cca" \ 11 | --interval=0 \ 12 | --json \ 13 | --no-delay \ 14 | --omit=1 \ 15 | --time="$run_duration" | 16 | jq .end.sum_received.bits_per_second 17 | ) 18 | printf '%s,%d,%f\n' "$cca" "$mtu" "$avg_throughput" 19 | sleep 1 # Give the server a moment to be ready again 20 | done 21 | } 22 | 23 | die() 24 | { 25 | local r=$1 26 | shift 27 | # shellcheck disable=SC2059 28 | printf "$@" >&2 29 | exit "$r" 30 | } 31 | 32 | progress() 33 | { 34 | if [[ $role == server || ! 
-t 1 ]]; then 35 | # shellcheck disable=SC2059 36 | printf "$@" >&2 37 | fi 38 | } 39 | 40 | server_runs() 41 | { 42 | for ((i = 0; i < runs; ++i)); do 43 | iperf3 --one-off --server 44 | done 45 | } 46 | 47 | readonly role=$1 48 | shift 49 | 50 | if [[ $role == client ]]; then 51 | cca=cubic 52 | link_speed=25 53 | run_duration=10 54 | runs=10 55 | # -s is listed in the optstring, so it needs its own arm; otherwise the catch-all ?) arm matches it and exits. 56 | while getopts 'C:r:s:t:' opt; do 57 | case $opt in 58 | C) cca=$OPTARG ;; 59 | r) runs=$OPTARG ;; s) link_speed=$OPTARG ;; 60 | t) run_duration=$OPTARG ;; 61 | ?) exit 2 ;; 62 | esac 63 | done 64 | 65 | readonly cca 66 | readonly run_duration 67 | readonly runs 68 | 69 | shift $((OPTIND - 1)) 70 | else 71 | while getopts '' opt; do 72 | case $opt in 73 | ?) exit 2 ;; 74 | esac 75 | done 76 | fi 77 | 78 | readonly iface=$1 79 | shift 80 | 81 | if [[ $role == client ]]; then 82 | readonly srv_ip=$1 83 | shift 84 | 85 | [[ $# -gt 0 ]] || die 2 'missing MTU(s)\n' 86 | readonly mtus=("$@") 87 | else 88 | if [[ $# -gt 0 ]]; then 89 | die 2 'unexpected arguments -- %s\n' "$*" 90 | fi 91 | fi 92 | 93 | if [[ $role == client ]]; then 94 | progress 'Sending experiment parameters to server …\n' 95 | netcat -q0 "$srv_ip" 5201 <<-EOF || die 1 "Cannot reach server, experiment must be started there first!\n" 96 | cca="$cca" 97 | mtus=(${mtus[*]}) 98 | runs="$runs" 99 | EOF 100 | else 101 | progress 'Waiting for client to send experiment parameters …\n' 102 | # Executing arbitrary remote shell code, what could go wrong O:-) 103 | eval "$(netcat -q0 -l 5201)" 104 | fi 105 | 106 | readonly repo_dir=${0%/*}/.. 107 | if [[ ! 
-d ${repo_dir}/tools ]]; then 108 | die 2 "I don’t know where I’m called from\n" 109 | fi 110 | 111 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" unload &>/dev/null || : 112 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" load 113 | "${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int" enable 114 | printf '%d' "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs 115 | 116 | ethtool -K "$iface" gro on 117 | 118 | [[ $role == server ]] || printf 'cca,mtu,avg_throughput\n' 119 | 120 | for mtu in "${mtus[@]}"; do 121 | ip link set dev "$iface" mtu "$mtu" 122 | sleep 1 123 | "${role}_runs" 124 | done 125 | -------------------------------------------------------------------------------- /tools/mtu_plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | 5 | import matplotlib as mpl 6 | import matplotlib.legend_handler 7 | import matplotlib.pyplot as plt 8 | import matplotlib.ticker as ticker 9 | import pandas as pd 10 | 11 | FIGSIZE = (2.5, 2) 12 | 13 | REPLACEMENTS = { 14 | "mtu": "MTU", 15 | } 16 | 17 | 18 | def main(): 19 | plt.rcParams.update({"pdf.fonttype": 42}) 20 | 21 | try: 22 | mpl.style.use("seaborn-v0_8-colorblind") 23 | except Exception as e: 24 | print("Failed to change matplotlib style: {}".format(e)) 25 | 26 | argparser = argparse.ArgumentParser() 27 | argparser.add_argument("csv_file", type=argparse.FileType("r")) 28 | argparser.add_argument("--max-mtu", type=int, help="maximum MTU value to plot") 29 | args = argparser.parse_args() 30 | 31 | df = pd.read_csv(args.csv_file).rename(columns=REPLACEMENTS).replace(REPLACEMENTS) 32 | args.csv_file.close() # Be nice and close files :-) 33 | 34 | if df["cca"].nunique() > 1: 35 | return "cannot plot for multiple CCAs" 36 | cca = df.loc[0, "cca"] 37 | 38 | max_mtu = max(df["MTU"]) 39 | try: 40 | max_mtu = min(args.max_mtu, max_mtu) 41 | except TypeError: 42 | pass 43 | 44 | df = df.loc[df["MTU"] <= max_mtu] 45 | df = 
df.groupby("MTU").mean(numeric_only=True) 46 | df["avg_throughput"] /= 10**9 47 | 48 | fig, ax = plt.subplots(figsize=FIGSIZE, layout="constrained") 49 | ax.plot( 50 | [max_mtu], [df.loc[max_mtu]] 51 | ) # Quick-n-dirty force same color as for GRO=on in iratio plot 52 | ax.plot(df.index, df["avg_throughput"], label="GRO on", marker="s") 53 | 54 | ax.grid(linestyle="--") 55 | ax.legend( 56 | bbox_to_anchor=(0.45, 1.1, 0.55, 0), 57 | borderaxespad=0, 58 | loc="lower left", 59 | mode="expand", 60 | ) 61 | ax.set_xlabel("MTU (bytes)") 62 | ax.set_ylabel("Throughput (Gbps)") 63 | 64 | ax.xaxis.set_major_locator(ticker.FixedLocator([1500, 4000, 7000, 9000])) 65 | ax.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100])) 66 | 67 | fig.savefig(f"{cca}-mtu-avg_throughput.pdf") 68 | 69 | plt.show() 70 | 71 | 72 | if __name__ == "__main__": 73 | raise SystemExit(main()) 74 | -------------------------------------------------------------------------------- /tools/plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | from pathlib import Path 6 | 7 | import matplotlib.pyplot as plt 8 | import matplotlib.ticker as ticker 9 | import numpy as np 10 | import pandas as pd 11 | from scipy.signal import savgol_filter 12 | 13 | # "CN color spec", see https://matplotlib.org/stable/tutorials/colors/colors.html#specifying-colors 14 | COLORS = [f"C{i}" for i in range(11)] 15 | 16 | 17 | COLUMNS = ( 18 | "ack_seq", 19 | "base_rtt", 20 | "beta", 21 | "cwnd", 22 | "delta_t", 23 | "ev", 24 | "p_norm", 25 | "p_smooth", 26 | "qlen", 27 | "rate", 28 | "rtt_grad", 29 | "time", 30 | ) 31 | 32 | FORMATS = { 33 | "ack_seq": ".", 34 | } 35 | 36 | LABELS = { 37 | "ack_seq": "ACK'ed sequence #", 38 | "base_rtt": "Base RTT", 39 | "beta": "Additive increase β", 40 | "cwnd": "Congestion window", 41 | "delta_t": "Time delta between ACKs", 42 | "ev": "Resets", 43 | "p_norm": "Normalized 
class DictStrArg:
    """argparse ``type=`` callable that parses ``key:value[,key:value...]``
    into a dict.

    Only the first ``:`` of an item separates key and value, so values may
    contain colons themselves (e.g. the matplotlib format string ``:``).

    Args:
        value_type: callable converting the value string; a ValueError it
            raises is reported as an argparse.ArgumentTypeError.
    """

    def __init__(self, value_type):
        self._value_type = value_type

    def __call__(self, list_str):
        def to_key_val(s):
            key, _sep, raw_value = s.partition(":")
            if len(key) == 0:
                raise argparse.ArgumentTypeError(f"missing key: '{s}'")
            if len(raw_value) == 0:
                raise argparse.ArgumentTypeError(
                    f"missing value for key '{key}': '{s}'"
                )
            try:
                return key, self._value_type(raw_value)
            except ValueError:
                raise argparse.ArgumentTypeError(
                    f"invalid {self._value_type.__name__} value for key '{key}': '{raw_value}'"
                ) from None

        # Empty items (e.g. from a trailing comma) are skipped.
        return dict(to_key_val(s) for s in list_str.split(",") if len(s) > 0)


class OddIntArg(int):
    """argparse ``type=`` for an odd integer greater than 2.

    The value is used as a filter window size, which has to exceed the
    polynomial order of 2 used when smoothing.

    Raises:
        argparse.ArgumentTypeError: if the string is not an integer, is even
            or is smaller than 3.
    """

    def __new__(cls, arg_str):
        try:
            self = super().__new__(cls, arg_str)
        except ValueError:
            pass
        else:
            if self % 2 != 0 and self > 2:
                return self
        raise argparse.ArgumentTypeError(
            f"invalid odd integer value greater than 2: '{arg_str}'"
        )
def load_df(csv_file):
    """Read a trace or iperf CSV file into a DataFrame and normalize it.

    - If a ``datetime`` column exists (iperf output), ``time`` is derived from
      it and ``rate`` is divided by the ``bit/s`` unit factor.
    - ``time`` is shifted so that it starts at 0.
    - If ``p_norm_scaled`` and ``power_scale`` exist, they are replaced by
      their quotient ``p_norm``.

    Args:
        csv_file: open file object; it is always closed, even if reading fails.

    Returns:
        The normalized pandas DataFrame.
    """
    try:
        df = pd.read_csv(csv_file)
    finally:
        csv_file.close()

    # Is this CSV output from iperf?
    try:
        df["time"] = df["datetime"] / UNIT_FACTORS["s"]
    except KeyError:
        pass
    else:
        df["rate"] /= UNIT_FACTORS["bit/s"]

    df["time"] -= df["time"].min()

    try:
        df["p_norm"] = df["p_norm_scaled"] / df["power_scale"]
    except KeyError:
        pass
    else:
        df = df.drop(columns=["p_norm_scaled", "power_scale"])

    return df


def set_ylim(axis, min_val, max_val, min_lim, max_lim):
    """Set the y limits of ``axis`` to the data range, clamped by optional
    limits and widened by the axis' current y margin.

    Args:
        axis: object providing ``margins()`` and ``set_ylim()``.
        min_val, max_val: smallest and largest plotted value.
        min_lim, max_lim: optional lower/upper bound; ``None`` means no bound.
    """
    if min_lim is not None:
        min_val = max(min_val, min_lim)
    if max_lim is not None:
        max_val = min(max_val, max_lim)
    _, ymargin = axis.margins()
    the_margin = ymargin * (max_val - min_val)
    axis.set_ylim(min_val - the_margin, max_val + the_margin)


def setup_axis(axis, col, unit):
    """Configure the tick locator and tick label format of one axis.

    Args:
        axis: the x or y axis object of a plot.
        col: name of the column shown on that axis.
        unit: unit of the shown values, or None.
    """
    pretty_unit = PRETTY_UNITS.get(unit, unit)
    axis.set_minor_locator(ticker.AutoMinorLocator())

    if col in ("ack_seq", "cwnd"):
        # Always shown as plain integers, whatever the unit.
        axis.set_major_formatter(ticker.StrMethodFormatter("{x:.0f}"))
    elif unit in ("bit/s", "bytes", "bytes/s"):
        axis.set_major_formatter(ticker.EngFormatter(unit=pretty_unit))
    elif unit in ("s", "ms", "us", "ns"):
        precision = 3 if unit == "s" else 0
        axis.set_major_formatter(
            ticker.StrMethodFormatter(f"{{x:.{precision}f}} {pretty_unit}")
        )
def main():
    """Plot columns of one or more trace/iperf CSV files, one subplot per file.

    With ``--info`` only a summary of each file is printed. Otherwise the
    columns given with ``-y`` are plotted over the ``-x`` column, either for
    one flow (``--flow``) or, for a single column, for all flows
    (``--all-flows``).

    Returns:
        None on success, otherwise an error message. The caller passes the
        return value to SystemExit, so a message yields a failing exit status.
    """
    early_argparser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
    early_argparser.add_argument(
        "--all-flows",
        action="store_true",
        help="select all flows",
    )
    early_argparser.add_argument(
        "--flow",
        default=0,
        type=int,
        help="select a single flow by zero-based index",
    )
    early_argparser.add_argument(
        "--info",
        action="store_true",
        help="show information about the CSV file and exit",
    )
    # --info is parsed first because it decides which options exist at all.
    early_args, remaining_argv = early_argparser.parse_known_args()

    argparser = argparse.ArgumentParser(
        add_help=True, allow_abbrev=False, parents=[early_argparser]
    )
    argparser.add_argument("csv_file", type=argparse.FileType("r"), nargs="+")

    if not early_args.info:
        argparser.add_argument("--fmt", default={}, type=DictStrArg(str))
        argparser.add_argument("--max", default={}, type=DictStrArg(float))
        argparser.add_argument("--min", default={}, type=DictStrArg(float))
        argparser.add_argument(
            "--smooth",
            const=99,
            default=0,
            nargs="?",
            type=OddIntArg,
            help="smooth the plotted data; an optionally given uneven integer number (greater than 2) specifies the filter window size",
        )
        argparser.add_argument("--title")
        argparser.add_argument("--unit", default={}, type=DictStrArg(str))
        argparser.add_argument("-x", choices=COLUMNS, default="time")
        # Without -y there is nothing to plot; let argparse report that
        # instead of failing later with a TypeError on None.
        argparser.add_argument("-y", choices=COLUMNS, nargs="+", required=True)

    args = argparser.parse_args(remaining_argv, early_args)

    if args.info:
        for csv_file in args.csv_file:
            df = load_df(csv_file)
            pd.options.display.float_format = "{:06.4f}".format
            df.info()
            print(f"\n{df.describe(percentiles=[])}")
        return

    if args.all_flows and len(args.y) > 1:
        return "can only plot one data column when plotting multiple flows"

    _fig, axs = plt.subplots(
        len(args.csv_file), 1, constrained_layout=True, squeeze=False
    )

    for ax, csv_file in zip(axs.flat, args.csv_file):
        df = load_df(csv_file)

        if "hash" in df:
            hash_col = "hash"
        elif "thread" in df:
            hash_col = "thread"
        else:
            return "missing a 'hash' or 'thread' column"

        available_hashes = df[hash_col].sort_values().unique()

        if not args.all_flows:
            try:
                selected_hash = available_hashes[args.flow]
            except IndexError:
                return (
                    f"flow index {args.flow} out of range, "
                    f"the file has {len(available_hashes)} flow(s)"
                )
            df = df[df[hash_col] == selected_hash]

        xunit = args.unit.get(args.x)
        try:
            xunit = check_unit(args.x, xunit)
        except LookupError as e:
            return str(e)

        # Work on copies: the frames below are filtered views and are modified
        # in place afterwards.
        df = df.copy()
        df[args.x] *= UNIT_FACTORS.get(xunit, 1)
        xmin = args.min.get(args.x, df[args.x].min())
        xmax = args.max.get(args.x, df[args.x].max())
        df = df[(df[args.x] >= xmin) & (df[args.x] <= xmax)].copy()

        ax.set_xlabel(LABELS.get(args.x, args.x))
        setup_axis(ax.xaxis, args.x, xunit)
        axx = None
        lines = []

        if args.all_flows:
            y = args.y[0]

            max_yval = None
            min_yval = None
            yfmt = args.fmt.get(y) or FORMATS.get(y, "-")
            ymax = args.max.get(y)
            ymin = args.min.get(y)
            yunit = args.unit.get(y)
            try:
                yunit = check_unit(y, yunit)
            except LookupError as e:
                return str(e)
            df[y] *= UNIT_FACTORS.get(yunit, 1)

            grouped_df = df.groupby(hash_col)

            for i, flow_hash in enumerate(available_hashes):
                # A flow may have no samples left inside the selected x range.
                if flow_hash not in grouped_df.groups:
                    continue
                flow_df = grouped_df.get_group(flow_hash)
                y_vals = (
                    savgol_filter(flow_df[y], args.smooth, 2)
                    if args.smooth > 0
                    else flow_df[y]
                )

                lines.extend(
                    ax.plot(
                        flow_df[args.x],
                        y_vals,
                        yfmt,
                        label=f"Flow {i}",
                    )
                )

                flow_max_yval = y_vals.max()
                flow_min_yval = y_vals.min()
                max_yval = (
                    flow_max_yval if max_yval is None else max(max_yval, flow_max_yval)
                )
                min_yval = (
                    flow_min_yval if min_yval is None else min(min_yval, flow_min_yval)
                )

            ax.set_ylabel(LABELS.get(y, y))
            if max_yval is not None:
                set_ylim(ax, min_yval, max_yval, ymin, ymax)
            setup_axis(ax.yaxis, y, yunit)
        else:
            for i, y in enumerate(args.y):
                if args.x == y:
                    return f"cannot use the same column “{args.x}” for x and y axis"

                ycolor = COLORS[i % len(COLORS)]
                yfmt = args.fmt.get(y) or FORMATS.get(y, "-")
                ymax = args.max.get(y)
                ymin = args.min.get(y)
                yunit = args.unit.get(y)
                try:
                    yunit = check_unit(y, yunit)
                except LookupError as e:
                    return str(e)
                df[y] *= UNIT_FACTORS.get(yunit, 1)

                # Every data column after the first gets its own y axis on
                # the right; "ev" is drawn as vertical lines on the main axes.
                if axx is not None and y not in ("ev",):
                    axx = ax.twinx()
                    axx.spines["right"].set_position(("outward", (len(lines) - 1) * 50))
                else:
                    axx = ax

                if y == "ev":
                    lines.append(
                        ax.vlines(
                            df.loc[df[y].notna(), args.x],
                            0,
                            1,
                            color="lightgrey",
                            label=LABELS.get(y),
                            transform=ax.get_xaxis_transform(),
                        )
                    )
                else:
                    y_vals = (
                        savgol_filter(df[y], args.smooth, 2)
                        if args.smooth > 0
                        else df[y]
                    )
                    lines.extend(
                        axx.plot(
                            df[args.x],
                            y_vals,
                            yfmt,
                            color=ycolor,
                            label=LABELS.get(y, y),
                        )
                    )
                    axx.set_ylabel(LABELS.get(y, y))
                    set_ylim(axx, y_vals.min(), y_vals.max(), ymin, ymax)
                    setup_axis(axx.yaxis, y, yunit)

        if len(lines) > 1:
            ax.legend(handles=lines)

        if args.title:
            ax.set_title(args.title)
        elif len(args.csv_file) > 1:
            ax.set_title(Path(csv_file.name).name)

    plt.show()
# Print a printf-formatted message to stderr and exit.
#   $1   exit status
#   $2…  printf format string and its arguments
die()
{
    local r=$1
    shift
    # shellcheck disable=SC2059
    printf "$@" >&2
    exit "$r"
}

# Filter stdin to stdout: print only the lines selected by a sed address,
# each prefixed with a string.
#   $1  sed address (e.g. '2,2' or '4,$'); empty selects all lines
#   $2  prefix, taken literally
prepend_lines()
{
    local addr=$1
    local prefix=$2
    local script
    # Escape the characters that are special in a sed replacement
    # (backslash, the delimiter "/" and "&") to insert the prefix verbatim.
    prefix=$(printf '%s' "$prefix" | sed -e 's|[\\/&]|\\&|g')
    printf -v script '%ss/^/%s/;%sp;d' "$addr" "$prefix" "$addr"
    sed -e "$script"
}

# Print a printf-formatted progress message to stderr.
progress()
{
    # shellcheck disable=SC2059
    printf "$@" >&2
}
# --- Command line handling ---------------------------------------------------
# Usage: powertcp_experiment client [-C CCA]... [-i INTERVAL] [-P PARAMS]
#                                   [-t DURATION] DESTINATION...
#        powertcp_experiment server
# Any role other than "client" is treated as the server role.

if [[ $# -lt 1 ]]; then
    die 2 'usage: %s client|server [OPTIONS] [DESTINATION...]\n' "$0"
fi

readonly role=$1
shift

# Defined for both roles: it is expanded when registering PowerTCP below,
# which fails under `set -u` in older bash versions if the variable is unset.
powertcp_params=()

if [[ $role == client ]]; then
    ccas=(bpf_powertcp)
    iperf_interval=1
    run_duration=10

    while getopts 'C:i:P:t:' opt; do
        case $opt in
        C) ccas+=("$OPTARG") ;;
        i) iperf_interval=$OPTARG ;;
        P)
            # Split on whitespace on purpose to get individual parameters.
            # shellcheck disable=SC2206
            powertcp_params=($OPTARG)
            ;;
        t) run_duration=$OPTARG ;;
        ?) exit 2 ;;
        esac
    done

    # Compare against Cubic when no other CCA was requested.
    [[ ${#ccas[@]} -gt 1 ]] || ccas+=(cubic)

    readonly ccas
    readonly iperf_interval
    readonly run_duration

    shift $((OPTIND - 1))
else
    while getopts '' opt; do
        case $opt in
        ?) exit 2 ;;
        esac
    done
fi
readonly powertcp_params

if [[ $role == client ]]; then
    [[ $# -gt 0 ]] || die 2 'missing destination(s)\n'
    readonly destinations=("$@")
elif [[ $# -gt 0 ]]; then
    die 2 'unexpected arguments -- %s\n' "$*"
fi

# --- Parameter exchange ------------------------------------------------------
# The client sends shell code defining `ccas` and `destinations`, which the
# server evaluates. Each array element is quoted with %q so that it arrives
# as exactly one element.

if [[ $role == client ]]; then
    progress 'Sending experiment parameters to server …\n'
    {
        printf 'readonly ccas=('
        printf '%q ' "${ccas[@]}"
        printf ')\nreadonly destinations=('
        printf '%q ' "${destinations[@]}"
        printf ')\n'
    } | netcat -q0 "${destinations[0]}" "$base_srv_port" ||
        die 1 "Cannot reach server, experiment must be started there first!\n"
else
    progress 'Waiting for client to send experiment parameters …\n'
    # Executing arbitrary remote shell code, what could go wrong O:-)
    # NOTE(review): only use this in a trusted network.
    eval "$(netcat -q0 -l "$base_srv_port")"
fi

# --- Environment setup -------------------------------------------------------

readonly repo_dir=${0%/*}/..
if [[ ! -d ${repo_dir}/tools ]]; then
    die 2 "I don’t know where I’m called from\n"
fi

tmpdir=$(mktemp --directory) || die 1 'failed to create a tempdir\n'
readonly tmpdir
# Expand $tmpdir now, it is the value to remove when the trap fires.
# shellcheck disable=SC2064
trap "rm -rf -- '$tmpdir'" EXIT HUP INT TERM

readonly powertcp_tool=${repo_dir}/bpf/powertcp
readonly tcp_int_tool=${repo_dir}/bpf/tcp-int/code/src/tools/tcp_int

# Output files are named after all CCAs, e.g. "bpf_powertcp-cubic-iperf.csv".
printf -v csvfile_prefix '%s-' "${ccas[@]}"
readonly iperf_csvfile=${csvfile_prefix}iperf.csv
readonly tcp_int_csvfile=${csvfile_prefix}tcp_int.csv
rm -f "$iperf_csvfile" "$tcp_int_csvfile"

# Reload TCP-INT from scratch and add this shell to its cgroup.
"$tcp_int_tool" unload &>/dev/null || :
"$tcp_int_tool" load
"$tcp_int_tool" enable
printf "%d" "$$" >/sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

"$powertcp_tool" register -f tracing "${powertcp_params[@]}"

# --- Experiment --------------------------------------------------------------

if [[ $role == client ]]; then
    for cca in "${ccas[@]}"; do
        progress '%s …' "$cca"
        clients_run
        progress ' done.\n'
    done

    # Connecting to the control port once more tells the server to stop.
    netcat -w1 -q0 "${destinations[0]}" "$base_srv_port"
else
    servers_run
fi
def main():
    """Plot throughput and queue depth of a `powertcp_experiment` run.

    Reads the iperf CSV file and the TCP-INT CSV file written by the
    experiment and saves two figures: ``<ccas>-throughput.pdf`` (rate of each
    flow over time, one line style per CCA) and ``<ccas>-qdepth.pdf`` (queue
    depth over time, averaged over ``--mean-window`` consecutive samples).
    """
    plt.rcParams.update({"pdf.fonttype": 42})

    try:
        mpl.style.use("seaborn-v0_8-colorblind")
    except Exception as e:
        print("Failed to change matplotlib style: {}".format(e))

    argparser = argparse.ArgumentParser()
    argparser.add_argument("--mean-window", default=1000, type=int)
    argparser.add_argument("--since", default=0, type=float)
    argparser.add_argument("--until", default=None, type=float)
    argparser.add_argument("iperf_csv_file", type=argparse.FileType("r"))
    argparser.add_argument("tcp_int_csv_file", type=argparse.FileType("r"))
    args = argparser.parse_args()

    # Close the file even when parsing fails.
    try:
        iperf_df = pd.read_csv(args.iperf_csv_file)
    finally:
        args.iperf_csv_file.close()

    ccas = iperf_df["cca"].unique()
    filename_prefix = "-".join(ccas)

    iperf_df = iperf_df.rename(columns=REPLACEMENTS).replace(REPLACEMENTS)
    # Let the time of each CCA's run start at 0.
    iperf_df["datetime"] = iperf_df["datetime"] - iperf_df.groupby("CCA")[
        "datetime"
    ].transform("min")
    iperf_df["Flow"] -= 50000
    iperf_df["rate"] /= 10**9

    # Without --until, cut all runs to the length of the shortest one.
    iperf_until = (
        args.until
        if args.until is not None
        else min(iperf_df.groupby("CCA")["datetime"].max())
    )
    # Copy: the filtered frame is modified in place below.
    iperf_df = iperf_df[
        (iperf_df["datetime"] >= args.since) & (iperf_df["datetime"] <= iperf_until)
    ].copy()
    iperf_df["datetime"] -= args.since

    fig1, ax1 = plt.subplots(figsize=FIGSIZE, layout="constrained")
    # One tuple of lines per CCA. The line styles are cycled, as for the
    # queue depth plot, so that no CCA is dropped if there are more CCAs
    # than styles.
    cca_rate_lines = {}
    for i, (cca, cca_df) in enumerate(iperf_df.groupby("CCA")):
        linestyle = LINESTYLES[i % len(LINESTYLES)]
        cca_rate_lines[cca] = tuple(
            ax1.plot("datetime", "rate", data=flow_df, linestyle=linestyle)[0]
            for _flow, flow_df in cca_df.groupby("Flow")
        )

    ax1.grid(linestyle="--")
    ax1.legend(
        bbox_to_anchor=(-0.15, 1.1, 1.15, 0),
        borderaxespad=0,
        handler_map={tuple: legend_handler.HandlerTuple(None, pad=0)},
        handles=list(cca_rate_lines.values()),
        labels=list(cca_rate_lines.keys()),
        loc="lower left",
        mode="expand",
        ncols=len(ccas),
    )
    ax1.set_xlabel("Time (s)")
    ax1.set_ylabel("Throughput (Gbps)")
    ax1.yaxis.set_major_locator(ticker.FixedLocator([1, 10, 15, 20, 25, 50, 100]))

    fig1.savefig(f"{filename_prefix}-throughput.pdf")

    try:
        tcp_int_df = pd.read_csv(args.tcp_int_csv_file)
    finally:
        args.tcp_int_csv_file.close()

    # The column names are stripped of surrounding whitespace before renaming.
    tcp_int_df.columns = tcp_int_df.columns.str.strip()
    tcp_int_df = tcp_int_df.rename(columns=REPLACEMENTS).replace(REPLACEMENTS)

    tcp_int_until = (
        args.until
        if args.until is not None
        else min(tcp_int_df.groupby("CCA")["time"].max())
    )
    tcp_int_df = tcp_int_df[
        (tcp_int_df["time"] >= args.since) & (tcp_int_df["time"] <= tcp_int_until)
    ].copy()
    tcp_int_df["Queue depth"] /= 1000
    tcp_int_df["time"] -= args.since

    fig2, ax2 = plt.subplots(figsize=FIGSIZE, layout="constrained")

    for i, cca_group in enumerate(tcp_int_df.groupby("CCA")):
        cca, cca_df = cca_group

        qdepth_per_cca_flow = pd.pivot_table(
            cca_df, index=np.arange(len(cca_df)), values=["Queue depth", "time"]
        )
        # Average over blocks of `mean_window` consecutive samples.
        qdepth_per_cca_flow = qdepth_per_cca_flow.groupby(
            np.arange(len(qdepth_per_cca_flow)) // args.mean_window
        ).mean()

        ax2.plot(
            "time",
            "Queue depth",
            data=qdepth_per_cca_flow,
            linestyle=LINESTYLES[i % len(LINESTYLES)],
            label=cca,
        )

    ax2.grid(linestyle="--")
    ax2.legend(
        bbox_to_anchor=(-0.25, 1.1, 1.25, 0),
        borderaxespad=0,
        loc="lower left",
        mode="expand",
        ncols=len(ccas),
    )
    ax2.set_xlabel("Time (s)")
    ax2.set_ylabel("Queue depth (KB)")
    ax2.yaxis.set_major_locator(ticker.MaxNLocator(nbins="auto", steps=[5, 10]))

    fig2.savefig(f"{filename_prefix}-qdepth.pdf")

    plt.show()
#!/bin/sh
#
# Remove a module (if loaded) and add it to the kernel (again).
#
# Usage: reinsmod MODULE_PATH [MODULE_PARAMETER...]
#
# MODULE_PATH is the path to the .ko file, the module name is derived from it.

set -eu

if [ $# -lt 1 ]; then
    printf 'Usage: %s MODULE_PATH [MODULE_PARAMETER...]\n' "$0" >&2
    exit 2
fi

modpath=$1
shift

modname=${modpath##*/}
modname=${modname%.ko}
# lsmod lists module names with underscores instead of hyphens.
modname=$(printf '%s' "$modname" | sed -e 's/-/_/g')

# Compare the whole first column to not match modules that merely contain the
# name (e.g. "tcp" in "tcp_bbr") or that only list it as a dependency.
if lsmod | awk -v name="$modname" '$1 == name { found = 1 } END { exit !found }'; then
    rmmod "$modname"
fi
insmod "$modpath" "$@"
#!/bin/sh
#
# Setup the BPF implementation and TCP-INT and start a screen session to use it
# (for e.g. iperf3 usage or with already running iperf(3) servers).
#
# Usage: setup-bpf SESSION [POWERTCP_PARAMETER...]
#
# SESSION names a screen configuration tools/screen/SESSION.screen, the
# remaining arguments are passed to `powertcp register`.

set -eu

if [ "$(id -u)" -ne 0 ]; then
    echo "setup-bpf: you probably want to execute this as root" >&2
    exit 2
fi

if [ $# -lt 1 ]; then
    echo "usage: setup-bpf SESSION [POWERTCP_PARAMETER...]" >&2
    exit 2
fi

repo_dir=${0%/*}/..
session=$1
user=${SUDO_USER:-$USER}

shift

if [ ! -d "$repo_dir/tools" ]; then
    echo "must be called from within the repository" >&2
    exit 2
fi

# All paths below, and those in the screen configurations, are relative to
# the repository root. Change there to not depend on the caller's directory.
cd "$repo_dir"

# Start from a clean state, neither has to be loaded already.
./bpf/powertcp unregister || :
./bpf/tcp-int/code/src/tools/tcp_int unload || :

./bpf/tcp-int/code/src/tools/tcp_int load
./bpf/tcp-int/code/src/tools/tcp_int enable
./bpf/powertcp register "$@"

# Add this shell to the TCP-INT cgroup.
echo $$ >> /sys/fs/cgroup/cgroup.tcp-int/cgroup.procs

screen -S "$session.$user" -c "tools/screen/$session.screen"
#!/usr/bin/env bpftrace

// Print the PowerTCP tracepoints as CSV, one row per new_ack or reset event.
// Values of the norm_power and update_window tracepoints are remembered per
// thread ID and printed with the next row of that thread.

// NOTE(review): the header name was lost from this line; net/tcp.h is assumed
// since CA_EVENT_CWND_RESTART is used below -- confirm against the repository.
#include <net/tcp.h>

BEGIN
{
    print("time,hash,ack_seq,base_rtt,beta,cwnd,rate,power_scale,p_norm_scaled,ev");
}

// Regular row, the "ev" column stays empty.
tracepoint:powertcp:new_ack
{
    printf("%llu,%u,%u,%ld,%d,%u,%lu,%ld,%ld,\n", args->time, args->hash,
           args->ack_seq, @base_rtt[tid], @beta[tid], args->cwnd, args->rate,
           @power_scale[tid], @p_norm[tid]);
}

tracepoint:powertcp:norm_power
{
    @base_rtt[tid] = args->base_rtt;
}

// Reset row, the "ack_seq" column stays empty. beta, power_scale and p_norm
// are only printed for events other than CA_EVENT_CWND_RESTART and only if a
// positive beta was recorded for this thread.
tracepoint:powertcp:reset
{
    if (args->ev != CA_EVENT_CWND_RESTART && @beta[tid] > 0) {
        printf("%llu,%u,,%ld,%d,%u,%lu,%ld,%ld,%d\n", args->time, args->hash,
               args->base_rtt, @beta[tid], args->cwnd, args->rate,
               @power_scale[tid], @p_norm[tid], args->ev);
    } else {
        printf("%llu,%u,,%ld,,%u,%lu,,,%d\n", args->time, args->hash,
               args->base_rtt, args->cwnd, args->rate, args->ev);
    }
}

tracepoint:powertcp:update_window
{
    @beta[tid] = args->beta;
    @power_scale[tid] = args->power_scale;
    @p_norm[tid] = args->p_norm;
}

// bpftrace prints all non-empty maps on exit, which would end up as garbage
// after the CSV rows.
END
{
    clear(@base_rtt);
    clear(@beta);
    clear(@power_scale);
    clear(@p_norm);
}
#!/bin/sh
#
# Tune the system and the given network interfaces.
#
# Usage: tune-eth INTERFACE...
#
# Every step is best effort: missing tools are skipped and failing commands do
# not stop the script (there is no `set -e` on purpose).
set -u

if [ $# -lt 1 ]; then
    printf "missing interface name\n" >&2
    exit 2
fi

# Run a command if its executable is found in PATH, otherwise print a note.
call_if_found()
{
    if command -v "$1" >/dev/null; then
        "$@"
    else
        printf '%s not found in PATH, skipping `%s`\n' "$1" "$*" >&2
    fi
}

# System-wide settings, applied once as they do not depend on the interface.
call_if_found tuned-adm profile network-latency

call_if_found sysctl -qw \
    vm.overcommit_memory=1 \
    net.core.busy_poll=50000 \
    net.core.busy_read=50000 \
    net.core.somaxconn=4096 \
    net.core.netdev_max_backlog=8192 \
    net.ipv4.tcp_max_syn_backlog=16384 \
    net.core.rmem_max=16777216 \
    net.core.wmem_max=16777216 \
    net.ipv4.tcp_mem="764688 1019584 16777216" \
    net.ipv4.tcp_rmem="8192 87380 16777216" \
    net.ipv4.tcp_wmem="8192 65536 16777216"

call_if_found sysctl -qw \
    net.ipv4.tcp_sack=0 \
    net.ipv4.tcp_timestamps=0

# Settings per interface.
for iface in "$@"; do
    # Ring buffer sizes
    call_if_found ethtool -G "$iface" \
        tx 8160 \
        rx 8160

    # Offloads
    call_if_found ethtool -K "$iface" \
        gro on \
        gso on \
        lro on \
        rx on \
        tso on \
        tx on

    # Interrupt coalescing
    call_if_found ethtool -C "$iface" \
        adaptive-rx off rx-usecs 0 \
        adaptive-tx off tx-usecs 10
done