├── .gitignore ├── README.md ├── leaf_conweave ├── Makefile ├── README.md ├── figs │ └── system-flowchart-rerouting.pdf └── p4src │ ├── includes │ ├── actions_egress.p4 │ ├── actions_ingress.p4 │ ├── conweave_egress.p4 │ ├── conweave_ingress.p4 │ ├── headers.p4 │ ├── macro.p4 │ ├── parser.p4 │ ├── registers_egress.p4 │ ├── registers_ingress.p4 │ ├── tables_egress.p4 │ └── tables_ingress.p4 │ └── leaf_conweave.p4 ├── leaf_conweave_resource ├── Makefile ├── mau.resources.log └── p4src │ ├── includes │ ├── actions_egress.p4 │ ├── actions_ingress.p4 │ ├── conweave_egress.p4 │ ├── conweave_ingress.p4 │ ├── headers.p4 │ ├── macro.p4 │ ├── parser.p4 │ ├── registers_egress.p4 │ ├── registers_ingress.p4 │ ├── tables_egress.p4 │ └── tables_ingress.p4 │ └── leaf_conweave_resource.p4 ├── native_afc ├── README.md ├── cp │ ├── rpc_afc_config.py │ ├── send_afc_pause.py │ └── setup.py └── p4src │ └── native_afc.p4 └── native_dcqcn ├── README.md ├── cp ├── devtest_cmds.py ├── get_rx_info.py ├── rate_limit.py ├── read_statistics.py └── setup.py └── p4src ├── includes ├── headers.p4 └── parser.p4 └── native_dcqcn.p4 /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Vscode 5 | .vscode/* 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ConWeave P4-16 Repository 2 | 3 | This is a GitHub repository for the SIGCOMM'23 paper "[Network Load Balancing with In-network Reordering Support for RDMA](https://doi.org/10.1145/3603269.3604849)". 4 | 5 | This repository includes the P4 programs of ConWeave. 6 | We used `BF-SDE-9.11.1` to compile and run the program. 7 | 8 | ### Resource Consumption on Tofino2 9 | * `leaf_conweave_resource` is a repo used solely to evaluate the data-plane resource consumption of the ConWeave mechanism. 
10 | * `leaf_conweave_resource/mau.resources.log` is the output log file, showing the percentages of resource consumption based on `p4-build`. 11 | 12 | 13 | ### [Artifact - P4 Source Code](leaf_conweave/README.md) 14 | `leaf_conweave` is a repo of ConWeave p4 source code running on leaf (ToR) switches. 15 | 16 | **_NOTE:_** The current repository is provided simply as reference code. 17 | It would be hard to exactly reproduce the testbed setup and evaluation results in the paper because of our complex testbed environment (e.g., the virtualized topology and its adapted codebase). 18 | 19 | For artifact evaluation, feel free to skip this, as the majority of results in the paper were obtained with an RDMA NS-3 simulator, which allows integrating various environment conditions that are hard to realize on a physical testbed and avoids randomness for fair comparative studies against existing baseline solutions. 20 | 21 | If time allows, we will provide a _simplified_ / _portable_ program that is runnable on a much simpler testbed and provides easy reproducibility. 22 | 23 | ### [Toy Example - Advanced Flow Control](native_afc/README.md) 24 | 25 | Some Tofino2 users have difficulty using the queue pause/resume feature of AFC. 26 | To this end, we provide a toy example so that you can easily run AFC and test whether the queue is indeed paused or not. 27 | 28 | 29 | ### [Toy Example - DCQCN ECN-Marking Implementation](native_dcqcn/README.md) 30 | 31 | For RDMA testbed setup, you need to configure the P4 switch with ECN-marking for DCQCN or DCTCP. 32 | We provide an example script of DCQCN implementation on Tofino1. 
33 | 34 | ### Credit 35 | 36 | ``` 37 | @inproceedings{song2023conweave, 38 | title={Network Load Balancing with In-network Reordering Support for RDMA}, 39 | author={Song, Cha Hwan and Khooi, Xin Zhe and Joshi, Raj and Choi, Inho and Li, Jialin and Chan, Mun Choon}, 40 | booktitle={Proceedings of SIGCOMM}, 41 | year={2023} 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /leaf_conweave/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | ${SDE}/p4_build.sh ./p4src/leaf_conweave.p4 --with-tofino2 3 | -------------------------------------------------------------------------------- /leaf_conweave/README.md: -------------------------------------------------------------------------------- 1 | # ConWeave Implementation on Tofino2 Leaf (ToR) Switches 2 | 3 | This repo includes the ConWeave P4 implementation which includes the logic of ConWeave load balancing and DCQCN ECN Marking at switches. 4 | The key feature needed to run ConWeave is Tofino2's Advanced Flow Control (AFC) that enables queue pause/resume. 5 | 6 | ## ConWeave Logic 7 | 8 | Logically, ConWeave implementation is categorized into two parts based on "where the (DATA or CONTROL) packet comes from": 9 | 1. Source ToR (or `SrcToR`) - in case where the packet comes from the sender RNIC, this is the first hop for the packet. For the logical flow, see [flowchart](figs/system-flowchart-rerouting.pdf) 10 | 2. Destination ToR (or `DstToR`) - in case where the switch is connected to the receiver RNIC and this is the last hop for the packet. 11 | 12 | * To configure ConWeave parameters, see [macro.p4](p4src/includes/macro.p4) and comments for details. 13 | 14 | 15 | ### Virtual Topology 16 | Note that this repository is used to evaluate ConWeave on our testbed with 16 RNICs and a _virtualized switching topology_. 
17 | Under that virtualized setup, it becomes complicated to distinguish whether the virtual switch corresponds to `SrcToR` or `DstToR` for the given input packet. 18 | This logic is implemented via assignment `switch_id` to each port and match-action tables (see [lines](https://github.com/conweave-project/conweave-p4/blob/1db645659574ffe15100bc4f3c75ba2e99548025/leaf_conweave/p4src/includes/conweave_ingress.p4#L77-L78)): 19 | ```c 20 | ... 21 | get_switch_id.apply(); /* -> meta.switch_id */ 22 | ... 23 | do_categorize_conweave_logical_step.apply(); /* categorize with p4-compiler-friendly coding (SrcToR/DstToR) */ 24 | ... 25 | ``` 26 | -------------------------------------------------------------------------------- /leaf_conweave/figs/system-flowchart-rerouting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/conweave-project/conweave-p4/39b34beadf3d2b97c59a90fac5819821e0f0ffa9/leaf_conweave/figs/system-flowchart-rerouting.pdf -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/actions_egress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | 14 | /**************************************************************************** 15 | * B A S I C F U N C T I O N S 16 | ****************************************************************************/ 17 | action nop() {} 18 | 19 | action swap_src_dst_fields() { /* swap src and dst */ 20 | /* swap srcip <-> dstip, using meta.dummy_32b as the temporary */ 21 | meta.dummy_32b = hdr.ipv4.dst_addr; 22 | hdr.ipv4.dst_addr = hdr.ipv4.src_addr; 23 | hdr.ipv4.src_addr = meta.dummy_32b; 24 | } 25 | 26 | /* HEADER CLEANING */ 27 | action invalid_conweave_eg() { 28 | hdr.cwh.setInvalid(); /* remove conweave 
header */ 29 | } 30 | action initialize_bth_header_eg() { /* zero every ConWeave-owned BTH field */ 31 | hdr.bth.conweave_opcode = 0; 32 | hdr.bth.conweave_phase = 0; 33 | hdr.bth.conweave_epoch = 0; 34 | hdr.bth.conweave_ask_reply = 0; 35 | hdr.bth.conweave_tail_flag = 0; 36 | hdr.bth.out_port = 0; 37 | } 38 | 39 | // ##### ECN Marking: set the CE codepoint (0b11) ##### 40 | action mark_ecn_ce_codepoint(){ 41 | hdr.ipv4.ecn = 0b11; 42 | } 43 | -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/actions_ingress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | /**************************************************************************** 14 | * B A S I C F U N C T I O N S 15 | ****************************************************************************/ 16 | action nop() {} 17 | 18 | action drop(bit<3> drop_bits) { 19 | ig_intr_md_for_dprsr.drop_ctl = drop_bits; 20 | } 21 | 22 | action set_port(PortId_t port) { /* records the port in metadata only */ 23 | meta.out_port = port; 24 | } 25 | 26 | action forward_port(PortId_t port) { 27 | ig_intr_md_for_tm.ucast_egress_port = port; 28 | } 29 | 30 | action forward_queue(QueueId_t qid) { 31 | ig_intr_md_for_tm.qid = qid; 32 | 33 | } 34 | 35 | action bypass_egress() { 36 | ig_intr_md_for_tm.bypass_egress = 1w1; 37 | } 38 | 39 | action resubmit_tx() { /* resubmit */ 40 | ig_intr_md_for_dprsr.resubmit_type = RESUB_DPRSR_DIGEST_REPLY; 41 | } 42 | 43 | action recirculate_rx() { // only for each out_port of ToR->Server 44 | #if (LPBK_FOR_CTRL == 1) 45 | meta.out_port = (bit<9>)16; // loopback port XXX 46 | #else 47 | meta.out_port[8:7] = meta.pipeline_index; // pipeline index 48 | meta.out_port[6:0] = (bit<7>)RECIRC_PORT; /* RECIRC PORT (for each pipe) */ 49 | #endif 50 | hdr.ipv4.ecn = 0x0; /* disable ECN during recirculation */ 51 | } 
52 | 53 | action swap_src_dst_fields() { /* swap srcip <-> dstip, using meta.dummy_32b as the temporary */ 54 | meta.dummy_32b = hdr.ipv4.dst_addr; 55 | hdr.ipv4.dst_addr = hdr.ipv4.src_addr; 56 | hdr.ipv4.src_addr = meta.dummy_32b; 57 | } 58 | 59 | /**************************************************************************** 60 | * C O N W E A V E - T X T O R 61 | ****************************************************************************/ 62 | 63 | /* REPLY DEADLINE */ 64 | action do_get_new_reply_timeout() { 65 | meta.ts_new_reply_timeout = meta.ts_now + meta.ts_base_rtt; 66 | } 67 | action do_get_max_reply_timeout() { 68 | meta.ts_new_reply_timeout = CONWEAVE_MAX_TIMESTAMP; 69 | } 70 | 71 | /* HEADER UPDATE */ 72 | action do_update_conweave_header_epoch() { 73 | hdr.bth.conweave_epoch = meta.result_epoch; /* conweave epoch */ 74 | } 75 | action do_update_conweave_header_phase() { 76 | hdr.bth.conweave_phase = meta.result_phase; /* conweave phase */ 77 | } 78 | action do_update_conweave_header_opcode(bit<2> opcode) { 79 | hdr.bth.conweave_opcode = opcode; /* conweave data tag */ 80 | } 81 | action update_conweave_header_tail_flag() { /* reply_timeout -> send TAIL packet */ 82 | hdr.bth.conweave_tail_flag = 1; 83 | } 84 | 85 | 86 | 87 | /**************************************************************************** 88 | * C O N W E A V E - R X T O R 89 | ****************************************************************************/ 90 | /* hashkey 32 bits */ 91 | Hash<bit<32>>(HashAlgorithm_t.CRC32) hash_crc32; 92 | action get_hash_flowkey_step1() { /* creates flow hashkey */ 93 | meta.hash_flowkey = (bit<32>)hash_crc32.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port}); 94 | } 95 | action get_hash_flowkey_step2() { /* crafts non-zero flow hashkey: saturating +1 maps 0 to 1 and cannot wrap back to 0 */ 96 | meta.hash_flowkey = meta.hash_flowkey |+| 1; 97 | } 98 | 99 | /* Sample QueueID -- NOTE(review): the Hash<...> type argument of the three externs below appears to have been lost in extraction (as it was for hash_crc32); restore it from the repository -- TODO confirm */ 100 | Hash(HashAlgorithm_t.CRC8) hash_crc8; 101 | Hash(HashAlgorithm_t.CRC16) hash_crc16; 102 | Hash(HashAlgorithm_t.IDENTITY) hash_identity; 103 | action 
sample_hash_qid_step_one() { /* three hash samples over the same (src ip, dst ip, udp src port) tuple */ 104 | meta.hash_qid_sample_c1 = (QueueId_t)(hash_crc8.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port})); 105 | meta.hash_qid_sample_c2 = (QueueId_t)(hash_crc16.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port})); 106 | meta.hash_qid_sample_c3 = (QueueId_t)(hash_identity.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port})); 107 | } 108 | action sample_hash_qid_step_two() { /* shift each sample by its own CONWEAVE_QREG_IDX_OFFSET_Cx */ 109 | meta.hash_qid_sample_c1 = meta.hash_qid_sample_c1 + CONWEAVE_QREG_IDX_OFFSET_C1; 110 | meta.hash_qid_sample_c2 = meta.hash_qid_sample_c2 + CONWEAVE_QREG_IDX_OFFSET_C2; 111 | meta.hash_qid_sample_c3 = meta.hash_qid_sample_c3 + CONWEAVE_QREG_IDX_OFFSET_C3; 112 | } 113 | 114 | 115 | 116 | 117 | /* ADJUST TIMESTAMP WRAP-AROUND */ 118 | action do_calc_tx_timegap_ts_rx() { 119 | meta.ts_timegap_rx = meta.ts_tail |-| meta.ts_phase0_tx; // saturating subtract: clamps at 0, no wrap-around 120 | } 121 | action do_default_tx_timegap_ts_rx() { 122 | meta.ts_timegap_rx = CONWEAVE_RX_DEFAULT_WAITING_TIME; // default flush waiting time 123 | meta.ts_phase0_rx = meta.ts_now; /** NOTE: overwrite as no phase0 info */ 124 | } 125 | /* CALC EXPECTED TAIL ARRIVAL TIME */ 126 | action do_calc_expected_tail_arrival_phase0_ts_rx() { 127 | meta.ts_expected_tail_arrival_rx = meta.ts_now + meta.ts_timegap_rx; 128 | } 129 | action do_calc_expected_tail_arrival_phase1_ts_rx() { 130 | meta.ts_expected_tail_arrival_rx = meta.ts_phase0_rx + meta.ts_timegap_rx; 131 | } 132 | 133 | 134 | /* HEADER CLEANING */ 135 | action invalid_conweave_ig() { 136 | hdr.cwh.setInvalid(); /* remove conweave header */ 137 | } 138 | action initialize_bth_header_ig() { /* zero every ConWeave-owned BTH field */ 139 | hdr.bth.conweave_opcode = 0; 140 | hdr.bth.conweave_phase = 0; 141 | hdr.bth.conweave_epoch = 0; 142 | hdr.bth.conweave_ask_reply = 0; 143 | hdr.bth.conweave_tail_flag = 0; 144 | hdr.bth.out_port = 0; 145 | } 146 | -------------------------------------------------------------------------------- 
/leaf_conweave/p4src/includes/conweave_egress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include 3 | #if __TARGET_TOFINO__ == 2 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | /************************************************************************* 14 | **************** E G R E S S P R O C E S S I N G ******************* 15 | *************************************************************************/ 16 | 17 | control SwitchEgress( 18 | inout header_t hdr, 19 | inout metadata_t meta, 20 | in egress_intrinsic_metadata_t eg_intr_md, 21 | in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr, 22 | inout egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr, 23 | inout egress_intrinsic_metadata_for_output_port_t eg_intr_md_for_oport) { 24 | /* include actions, registers, and tables */ 25 | 26 | #include "actions_egress.p4" 27 | #include "registers_egress.p4" 28 | #include "tables_egress.p4" 29 | 30 | apply { 31 | /*------------------------------------------------------------------------------------------ 32 | Tx REPLY, NOTIFY -> {swap src/dst, update phase, and update opcode} based on metadata 33 | 34 | TODO: REPLY will have no delay by queue. 35 | But, NOTIFY can be delayed because we craft using the "original" packet (mirror_option > 0). 36 | Later, we should craft using the mirrored packet (how?) 
37 | -------------------------------------------------------------------------------------------*/ 38 | if (meta.ig_mirror1.mirror_option == 1) { /** Reply of TAIL (NOTE:using original packet) */ 39 | swap_src_dst_fields(); 40 | if (hdr.cwctrl.isValid()) { 41 | /* this step is necessary, because we sometimes mirror CTRL pkt (see CWCTRL part at ingress pipeline) */ 42 | hdr.cwctrl.setInvalid(); 43 | hdr.ethernet.ether_type = ether_type_t.IPV4; 44 | } 45 | hdr.bth.conweave_opcode = 2; 46 | hdr.bth.conweave_phase = 1; 47 | hdr.bth.conweave_ask_reply = 0; 48 | hdr.bth.conweave_tail_flag = 0; 49 | hdr.bth.flags = 0; 50 | hdr.ipv4.ecn = 0b00; 51 | do_debug_eg_cntr1(); // XXX 52 | 53 | // use same epoch 54 | // exit; 55 | } else if (meta.ig_mirror1.mirror_option == 2) { /** Reply of INIT (NOTE:using original packet) */ 56 | swap_src_dst_fields(); 57 | hdr.bth.conweave_opcode = 2; 58 | hdr.bth.conweave_phase = 0; 59 | hdr.bth.conweave_ask_reply = 0; 60 | hdr.bth.conweave_tail_flag = 0; 61 | hdr.bth.flags = (bit<8>)hdr.ipv4.ecn; /* INIT Reply with NOTIFY */ 62 | hdr.ipv4.ecn = 0b00; 63 | do_debug_eg_cntr2(); // XXX 64 | 65 | // use same epoch 66 | // exit; 67 | } 68 | #if (LPBK_FOR_NOTIFY == 1) 69 | else if (eg_intr_md.egress_port == 8) { /** NOTIFY (NOTE: using crafted packet) */ 70 | #else 71 | else if (meta.ig_mirror1.mirror_option == 3) { /* NOTIFY */ 72 | #endif 73 | swap_src_dst_fields(); 74 | hdr.bth.conweave_opcode = 3; 75 | hdr.bth.conweave_ask_reply = 0; 76 | hdr.bth.conweave_tail_flag = 0; 77 | hdr.bth.flags = 0; 78 | hdr.ipv4.ecn = 0b00; 79 | do_debug_eg_cntr3(); // XXX 80 | 81 | // use same epoch 82 | // use same out_port in bth 83 | // exit; 84 | } else { /* rest of packets toward recirc/lbpk -> CWCTRL */ 85 | /** NOTE: RUN ONLY ONCE PER CWCTRL PKT, Newly mirrored packet for Ctrl */ 86 | #if (LPBK_FOR_CTRL == 1) 87 | if (eg_intr_md.egress_port == 16 && hdr.cwctrl.isValid() == false) { /** CTRL (NOTE: using crafted packet) */ 88 | #else 89 | if 
(eg_intr_md.egress_port [6:0] == (bit<7>)RECIRC_PORT && hdr.cwctrl.isValid() == false) { 90 | #endif 91 | /* validate hdr.cwctrl header!! */ 92 | hdr.cwctrl.setValid(); 93 | hdr.cwctrl.pre_timeout = 0; 94 | hdr.cwctrl.timeout = 0; 95 | hdr.cwctrl.drop = 0; 96 | hdr.cwctrl.cntr_eg = 0; 97 | hdr.cwctrl.afc_msg = 0; 98 | hdr.ethernet.ether_type = (bit<16>)ether_type_t.CWCTRL; 99 | 100 | /* update/initialize header */ 101 | hdr.bth.conweave_ask_reply = 0; 102 | hdr.bth.conweave_tail_flag = 0; 103 | hdr.bth.flags = 0; 104 | hdr.ipv4.ecn = 0b00; 105 | do_debug_eg_cntr4(); // XXX 106 | // use same epoch 107 | // use same phase (phase-1) 108 | // use same opcode (1) 109 | // exit; 110 | } else { 111 | /* mirror_option = 4 -> ORIGINAL NewOoO PACKET !! */ 112 | 113 | #if (LPBK_FOR_CTRL == 1) 114 | if (eg_intr_md.egress_port == 16 && hdr.cwctrl.isValid()) { /* CTRL */ 115 | #else 116 | if (eg_intr_md.egress_port [6:0] == (bit<7>)RECIRC_PORT && hdr.cwctrl.isValid()) { /* CTRL */ 117 | #endif 118 | /*----------------------------------------------------------- 119 | Egress Dequeue Depth History 120 | ------------------------------------------------------------*/ 121 | /* hdr.cwctrl.afc_msg (32bits) -> meta.idx_qdepth_history_rx */ 122 | do_get_idx_queue_occupancy_array_ctrl_eg.apply(); 123 | if (meta.hit_idx_queue_occupancy_tbl_eg == 1) { /* if hit */ 124 | /** DROP: reorder is resolved, reset counter to 0 */ 125 | /** READ: read register and save to cwctrl header */ 126 | do_read_reset_buffer_egress_cntr(); /* READ -> hdr.cwctrl.cntr_eg */ 127 | } 128 | 129 | if (hdr.cwctrl.pre_timeout == 1) { 130 | // hdr.cwctrl.hashidx 131 | do_check_tail_resume(); // -> meta.flag_check_tail_resume = 1 if TAIL already resumed the reorder queue 132 | if (meta.flag_check_tail_resume == 1) { 133 | hdr.cwctrl.pre_timeout = 0; 134 | hdr.cwctrl.timeout = 1; 135 | } 136 | } 137 | 138 | } else { 139 | /*-------------------------------------------------------------------------------------*/ 140 | 
/*------ Only DATA packets (at both srcToR/dstToR) will be processed by following -----*/ 141 | /*-------------------------------------------------------------------------------------*/ 142 | 143 | /*------------------------------------------ 144 | Resume Reorder Queue - by TAIL 145 | -------------------------------------------*/ 146 | if (hdr.tailh.isValid()) { /* resume the reorder queue at egress deparser*/ 147 | eg_intr_md_for_dprsr.adv_flow_ctl = hdr.tailh.afc_msg_resume; 148 | 149 | /* return back to original packet header */ 150 | hdr.tailh.setInvalid(); 151 | hdr.ethernet.ether_type = ether_type_t.IPV4; 152 | 153 | /* record TAIL has resumed the reorder queue. CTRL will check it. */ 154 | do_update_tail_resume(); 155 | do_debug_eg_cntr5(); // XXX 156 | } 157 | 158 | /*----------------------------------------------------------- 159 | Egress Dequeue Depth History 160 | ------------------------------------------------------------*/ 161 | /* eg_intr_md.egress_port (9 bits), eg_intr_md.egress_qid (7 bits) -> meta.idx_qdepth_history_rx **/ 162 | do_get_idx_queue_occupancy_array_data_eg.apply(); 163 | if (meta.hit_idx_queue_occupancy_tbl_eg == 1) { /* if hit */ 164 | /** DEQUEUE: increase counter by 1 */ 165 | do_increment_buffer_egress_cntr(); 166 | } 167 | 168 | /*----------------------------------------------------------- 169 | ECN MARKING (DCQCN <- RDMA, DCTCP <- TCP) 170 | ------------------------------------------------------------*/ 171 | if (hdr.ipv4.ecn == 0b01 || hdr.ipv4.ecn == 0b10) { 172 | if (meta.is_roce_v2 == 1) { // RoCEv2 Pkt 173 | /* DCQCN (RED-like marking) */ 174 | dcqcn_get_ecn_probability.apply(); // get probability to ecn-mark 175 | dcqcn_get_random_number(); // get random number for sampling 176 | dcqcn_compare_probability.apply(); // fills meta.mark_ecn_codepoint 177 | } else { // use DCTCP-like marking 178 | check_ecn_marking_threshold(); // fills meta.mark_ecn_codepoint 179 | } 180 | 181 | if (meta.mark_ecn_codepoint == 1) { 182 | 
mark_ecn_ce_codepoint(); 183 | } 184 | } // #### ECN Marking (end) ###### 185 | 186 | /*--------------------------------------------------------------------- 187 | CLEAR BTH & CWH HEADERS OF ORIGINAL (NON-MIRRORING) PKTS TO DST 188 | ----------------------------------------------------------------------*/ 189 | do_check_toward_dst.apply(); /* -> meta.last_hop */ 190 | if (meta.last_hop == 1) { 191 | if (hdr.cwh.isValid()) { 192 | invalid_conweave_eg(); 193 | } 194 | if (hdr.bth.isValid()) { 195 | initialize_bth_header_eg(); 196 | } 197 | 198 | } 199 | } 200 | } 201 | } 202 | } 203 | } // End of SwitchEgress -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/headers.p4: -------------------------------------------------------------------------------- 1 | #ifndef _HEADERS_ 2 | #define _HEADERS_ 3 | 4 | #include "macro.p4" 5 | 6 | /******************************************************* 7 | **** C L A S S I C H E A D E R **** 8 | ********************************************************/ 9 | 10 | header ethernet_h { 11 | mac_addr_t dst_addr; 12 | mac_addr_t src_addr; 13 | bit<16> ether_type; 14 | } 15 | 16 | header arp_h { 17 | bit<16> htype; 18 | bit<16> ptype; 19 | bit<8> hlen; 20 | bit<8> plen; 21 | bit<16> oper; 22 | mac_addr_t sender_hw_addr; 23 | ipv4_addr_t sender_ip_addr; 24 | mac_addr_t target_hw_addr; 25 | ipv4_addr_t target_ip_addr; 26 | } 27 | 28 | header ipv4_h { 29 | bit<4> version; 30 | bit<4> ihl; 31 | bit<6> dscp; // tos field 32 | bit<2> ecn; // tos field 33 | bit<16> total_len; // 1024B MTU RDMA -> 1084 (CX6), 1068 (CX5 except WR_FIRST) 34 | bit<16> identification; 35 | bit<3> flags; 36 | bit<13> frag_offset; 37 | bit<8> ttl; 38 | bit<8> protocol; 39 | bit<16> hdr_checksum; 40 | ipv4_addr_t src_addr; 41 | ipv4_addr_t dst_addr; 42 | } 43 | 44 | header tcp_h { 45 | bit<16> src_port; 46 | bit<16> dst_port; 47 | bit<32> seq_no; 48 | bit<32> ack_no; 49 | bit<4> data_offset; 50 | bit<4> res; 51 | 
bit<8> flags; 52 | bit<16> window; 53 | bit<16> checksum; 54 | bit<16> urgent_ptr; 55 | } 56 | 57 | header udp_h { 58 | bit<16> src_port; 59 | bit<16> dst_port; 60 | bit<16> hdr_length; 61 | bit<16> checksum; 62 | } 63 | 64 | header icmp_h { 65 | bit<8> type_; 66 | bit<8> code; 67 | bit<16> hdr_checksum; 68 | bit<16> id; 69 | bit<16> seq_no; 70 | bit<64> data_time; 71 | } 72 | 73 | /*---- RDMA (12 bytes) ----*/ 74 | header ib_bth_h { 75 | bit<8> opcode; 76 | bit<8> flags; /** NOTE: "flags" field is used for REPLY INIT's ECN (0x3) between SrcToR/DstToR. No effect on RDMA. */ 77 | bit<16> partition_key; 78 | 79 | /*--- RC reserved0 (8 bits) ----*/ 80 | bit<8> out_port; 81 | /*---------------------*/ 82 | 83 | bit<24> destination_qp; 84 | bit<1> ack_request; 85 | 86 | /*--- RC reserved1 (7 bits)----*/ 87 | bit<2> conweave_opcode; /* 0: NOTHING, 1: DATA, 2: REPLY, 3: NOTIFY */ 88 | bit<1> conweave_phase; 89 | bit<2> conweave_epoch; 90 | bit<1> conweave_ask_reply; 91 | bit<1> conweave_tail_flag; /* TAIL */ 92 | /*---------------------*/ 93 | 94 | bit<24> packet_seqnum; 95 | } 96 | 97 | // ACK 98 | header ib_aeth_h { 99 | bit<1> reserved; 100 | bit<2> opcode; // (0: ACK, 3: NACK) 101 | bit<5> error_code; // (PSN SEQ ERROR) 102 | bit<8> msg_seq_number; 103 | } 104 | 105 | /******************************************************* 106 | **** A D V A N C E D F L O W C O N T R O L **** 107 | *******************************************************/ 108 | header conweave_ctrl_h { 109 | @padding bit<5> _pad1; 110 | bit<1> pre_timeout; /* 1: pre_timeout (must check egress register) */ 111 | bit<1> timeout; /* 1: timeout triggered */ 112 | bit<1> drop; /* 1: must be dropped */ 113 | bit<32> cntr_eg; /* reorder-buffer egress counter */ 114 | bit<32> afc_msg; /* without credit setup (i.e., the least significant 15 bits are empty) */ 115 | bit<16> hashidx; // hashidx for egress pipeline 116 | 117 | 118 | /** AFC: Format */ 119 | // bit<1> qfc; 120 | // bit<2> tm_pipe_id; 121 | // 
bit<4> tm_mac_id; 122 | // bit<3> _pad; 123 | // bit<7> tm_mac_qid; 124 | // bit<15> credit; 125 | } 126 | 127 | header conweave_tail_h { 128 | bit<32> afc_msg_resume; 129 | bit<16> hashidx; // hashidx for egress pipeline 130 | } 131 | 132 | /******************************************************* 133 | **** C O N W E A V E H E A D E R **** 134 | *******************************************************/ 135 | header conweave_h { 136 | bit<16> ts_tx; 137 | bit<16> ts_tail; 138 | } 139 | 140 | header resubmit_h { 141 | } 142 | 143 | header eg_mirror1_h { 144 | } 145 | 146 | header ig_mirror1_h { 147 | bit<8> mirror_option; /* 1: TAIL's REPLY (CLEAR), 2: INIT's REPLY, 3: NOTIFY, 4: Reorder-Ctrl */ 148 | } 149 | 150 | struct header_t { 151 | ethernet_h ethernet; 152 | ipv4_h ipv4; 153 | arp_h arp; 154 | tcp_h tcp; 155 | udp_h udp; 156 | icmp_h icmp; 157 | ib_bth_h bth; /* RDMA headers */ 158 | conweave_h cwh; /* ConWeave header */ 159 | conweave_ctrl_h cwctrl; /* ConWeave Ctrl header */ 160 | conweave_tail_h tailh; /* ConWeave TAIL header (if needed) */ 161 | } 162 | 163 | /******************************************************* 164 | **** H E A D E R & M E T A D A T A **** 165 | ********************************************************/ 166 | 167 | struct metadata_t { 168 | /* resubmit or mirroring */ 169 | resubmit_h resubmit_hdr; 170 | eg_mirror1_h eg_mirror1; 171 | ig_mirror1_h ig_mirror1; 172 | MirrorId_t mirror_session; 173 | 174 | /* ConWeave */ 175 | bit<1> conweave_on_off; /* switch on/off */ 176 | bit<2> conweave_logic; /* 1: TxToR, 2: RxToR, 3: WRONG, 0: intra-ToR */ 177 | bit<2> pipeline_index; /* ig_intr_md.ingress_port[8:7], see parser */ 178 | 179 | /* switch's ID for our virtual topology */ 180 | switch_id_t switch_id; 181 | nexthop_id_t nexthop_id; 182 | bit<1> last_hop; 183 | PortId_t out_port; // final 184 | QueueId_t out_queue_id; // final 185 | 186 | 187 | /* dummy and common metadata */ 188 | ipv4_addr_t dummy_32b; /* for sip<->dip swap (REPLY & 
NOTIFY) */ 189 | ipv4_addr_t meta_src_addr; 190 | ipv4_addr_t meta_dst_addr; 191 | timestamp_t ts_now; 192 | timestamp_t ts_tail; 193 | hashidx_t hashidx; /* key & table idx */ 194 | bit<1> digest_on; /* digest flowkey */ 195 | 196 | 197 | /* packet metadata */ 198 | bit<2> pkt_epoch; /* <- hdr.bth.conweave_epoch */ 199 | bit<1> pkt_phase; /* <- hdr.bth.conweave_phase */ 200 | bit<1> pkt_ask_reply; /* <- hdr.bth.conweave_ask_reply */ 201 | bit<1> pkt_tail_flag; /* <- hdr.bth.conweave_tail_flag */ 202 | 203 | bit<1> flag_cwctrl_active; /* hdr.cwctrl.isValid() */ 204 | bit<1> pkt_cwctrl_timeout; /* <- hdr.cwctrl.timeout */ 205 | bit<1> pkt_cwctrl_drop; /* <- hdr.cwctrl.drop */ 206 | bit<32> pkt_cwctrl_cntr_eg; /* <- hdr.cwctrl.cntr_eg */ 207 | bit<32> pkt_cwctrl_afc_msg; /* <- hdr.cwctrl.afc_msg */ 208 | 209 | /* pair for initialization */ 210 | pair init_cntr_ig; 211 | /*********************************************************** 212 | * C O N W E A V E - T X M E T A D A T A 213 | ***********************************************************/ 214 | /* timestamp */ 215 | timestamp_t ts_base_rtt; 216 | timestamp_t ts_new_reply_timeout; 217 | 218 | /* sampled port info */ 219 | bit<8> sample_port_c1; // chance 1 220 | bit<8> sample_port_c2; // chance 2 221 | bit<8> good_port; // good port without ECN marking 222 | bit<8> final_port; // final port to send a current packet 223 | bit<1> no_good_port; // if good_port is not actually good enough 224 | bit<2> stage_to_record_port; // CRC8 or out_port[1:0] 225 | 226 | /* metadata at TX */ 227 | bit<1> flag_rdma_data; 228 | bit<1> flag_matched; // 1: found from get_hash_idx table 229 | bit<1> flag_enforce_no_reroute; // 1: enforce not to reroute, since TS_MAX - 10ms 230 | bit<1> result_expired; // 1: expired 231 | bit<1> result_stability; // 1: stable 232 | bit<1> result_reply_timeout; // 1: timeout 233 | bit<1> result_timely_replied; // 1: timely replied 234 | bit<1> result_phase; // phase 1 is possible only when we call 
"do_get_phase()" 235 | bit<2> result_epoch; // current epoch 236 | bit<1> result_port_c1_bad; // 1: sample_c1 is bad port 237 | bit<1> result_port_c2_bad; // 1: sample_c2 is bad port 238 | bit<1> result_reply_with_notify; // 1: INIT's reply with NOTIFY 239 | 240 | /*********************************************************** 241 | * C O N W E A V E - R X M E T A D A T A 242 | ***********************************************************/ 243 | bit<32> hash_flowkey; 244 | 245 | timestamp_t ts_phase0_tx; 246 | timestamp_t ts_phase0_rx; 247 | timestamp_t ts_timegap_rx; /* tail_tx - phase0_tx */ 248 | timestamp_t ts_expected_tail_arrival_rx; /* time to flush queue */ 249 | 250 | bit<2> result_epoch_rx; /* 1: new epoch, 2: prev epoch so bypass, 0: process */ 251 | bit<1> result_phase0_cch_rx; /* 1: phase-0 pkt has passed (or is passing) */ 252 | bit<1> result_tail_cch_rx; /* 1: tail has passed (or is passing) */ 253 | bit<1> result_out_of_order_rx; /* 1: out-of-ordered packet */ 254 | bit<2> result_reorder_status; /* 1: reorder is on-going, 2: new register */ 255 | 256 | QueueId_t hash_qid_sample_c1; // 25G: 4 queues (2 bits), 100G: 8 queues (3 bits) 257 | QueueId_t hash_qid_sample_c2; // 25G: 4 queues (2 bits), 100G: 8 queues (3 bits) 258 | QueueId_t hash_qid_sample_c3; // 25G: 4 queues (2 bits), 100G: 8 queues (3 bits) 259 | 260 | conweave_qreg_idx_width_t idx_q_occup_arr_rx_c1; // 12 bits - port(9) + queue(3) 261 | conweave_qreg_idx_width_t idx_q_occup_arr_rx_c2; // 12 bits - port(9) + queue(3) 262 | conweave_qreg_idx_width_t idx_q_occup_arr_rx_c3; // 12 bits - port(9) + queue(3) 263 | 264 | bit<1> result_q_occupancy_c1; /* 1: registered, or matched */ 265 | bit<1> result_q_occupancy_c2; /* 1: registered, or matched */ 266 | bit<1> result_q_occupancy_c3; /* 1: registered, or matched */ 267 | 268 | bit<1> result_time_flush_queue_rx; /* 1: timeout */ 269 | bit<1> possibly_tail_before_timeout; /* 1: possibly TAIL before timeout */ 270 | bit<1> flag_mirr_for_ctrl_loop; /* 
1: mirror */ 271 | bit<1> result_tail_send_reply_rx; /* 1: send TAIL's reply */ 272 | bit<1> flag_finish_reorder_process; /* 1: reorder is resolved */ 273 | bit<1> flag_resume_reorder_queue; /* 1: resume reorder queue */ 274 | bit<1> flag_check_tail_resume; /* 1: queue is resumed in advance by TAIL */ 275 | bit<32> result_q_pkt_cntr_ig; /* counter */ 276 | 277 | /* Egress qdepth metadata */ 278 | conweave_qdepth_idx_width_t idx_qdepth_history_rx; // 13 bits 279 | 280 | 281 | /*********** T E M P O R A R I L Y ********/ 282 | bit<1> cntr_additive; 283 | 284 | 285 | 286 | /*********************************************************** 287 | * A D V A N C E D F L O W C O N T R O L 288 | ***********************************************************/ 289 | afc_msg_t afc_msg_c1; // 32 bits, without PAUSE/RESUME instruction yet 290 | afc_msg_t afc_msg_c2; // 32 bits, without PAUSE/RESUME instruction yet 291 | afc_msg_t afc_msg_c3; // 32 bits, without PAUSE/RESUME instruction yet 292 | 293 | 294 | /*********************************************************** 295 | * D C Q C N - E C N M A R K I N G 296 | ***********************************************************/ 297 | bit<1> mark_ecn_codepoint; 298 | bit<1> is_roce_v2; 299 | bit<8> dcqcn_prob_output; 300 | bit<8> dcqcn_random_number; 301 | 302 | 303 | /*********************************************************** 304 | * S O M E T H I N G D E B U G 305 | ***********************************************************/ 306 | bit<1> flag_something_wrong; 307 | 308 | /*********************************************************** 309 | * S O M E T H I N G E G R E S S 310 | ***********************************************************/ 311 | bit<1> hit_idx_queue_occupancy_tbl_eg; 312 | } 313 | 314 | #endif -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/macro.p4: -------------------------------------------------------------------------------- 1 | #ifndef _MACROS_ 2 | #define 
_MACROS_ 3 | 4 | #define LPBK_FOR_CTRL (1) 5 | #define LPBK_FOR_NOTIFY (1) 6 | 7 | /*************************************************************************/ 8 | /****** IMPORTANT: Different configuration for 25G/100G link speed *******/ 9 | #define CONWEAVE_EVAL_Q16_OR_Q32 (0) // 0: 16 per port, 1: 32 per port 10 | // check all {config_leaf.py, leaf_conweave.cpp, macro.p4} 11 | /*************************************************************************/ 12 | 13 | /** IMPORTANT: We assume using 32 queues per front-panel port. (see python script) */ 14 | /*************************************************************************/ 15 | 16 | /************************************************************************* 17 | ************* C O N S T A N T S A N D T Y P E S ******************* 18 | *************************************************************************/ 19 | 20 | /* for ConWeave Table */ 21 | #define CONWEAVE_HASH_WIDTH (12) // maximum 16 bits 22 | #define CONWEAVE_TABLE_SIZE (1 << CONWEAVE_HASH_WIDTH) 23 | typedef bit<CONWEAVE_HASH_WIDTH> hashidx_t; 24 | 25 | /* for ConWeave Reordering Queue */ 26 | #define CONWEAVE_QREG_IDX_WIDTH (10) // 10 bits, at ingress 27 | #define CONWEAVE_QREG_IDX_SIZE (1 << CONWEAVE_QREG_IDX_WIDTH) 28 | typedef bit<CONWEAVE_QREG_IDX_WIDTH> conweave_qreg_idx_width_t; 29 | 30 | #if (CONWEAVE_EVAL_Q16_OR_Q32 == 0) // 0: Q16 (25Gbps), 1: Q32 (100Gbps) 31 | #define CONWEAVE_QREG_IDX_OFFSET_C1 (2) 32 | #define CONWEAVE_QREG_IDX_OFFSET_C2 (6) 33 | #define CONWEAVE_QREG_IDX_OFFSET_C3 (10) 34 | #else 35 | #define CONWEAVE_QREG_IDX_OFFSET_C1 (2) 36 | #define CONWEAVE_QREG_IDX_OFFSET_C2 (10) 37 | #define CONWEAVE_QREG_IDX_OFFSET_C3 (18) 38 | #endif 39 | 40 | #define CONWEAVE_QDEPTH_IDX_WIDTH (10) // 10 bits, at egress 41 | #define CONWEAVE_QDEPTH_IDX_SIZE (1 << CONWEAVE_QDEPTH_IDX_WIDTH) 42 | typedef bit<CONWEAVE_QDEPTH_IDX_WIDTH> conweave_qdepth_idx_width_t; 43 | 44 | /* type definitions */ 45 | typedef bit<32> afc_msg_t; 46 | typedef bit<48> mac_addr_t; 47 | typedef bit<32> ipv4_addr_t; 48 | 49 | typedef 
bit<12> nexthop_id_t; 50 | typedef bit<8> switch_id_t; 51 | typedef bit<32> timestamp_t; // use middle of 31 bits, e.g., (bit<32>)X[40:10] 52 | #if (CONWEAVE_EVAL_Q16_OR_Q32 == 0) 53 | typedef bit<2> conweave_qid_width_t; // 2 bits - 4 queues with 3 stages - total 12 queues 54 | #else 55 | typedef bit<3> conweave_qid_width_t; // 3 bits - 8 queues with 3 stages - total 24 queues 56 | #endif 57 | 58 | 59 | /************************************************************************************************/ 60 | /* FOR DEBUGGING (MAKE SLOW) */ 61 | #define TIME_RESOLUTION_OFFSET1 (0) /* 17: (debug) 0.13 actual sec per unit, 0: original speed */ 62 | #define TIME_RESOLUTION_OFFSET2 (0) /* 7: (debug) max resolution, 0: original speed */ 63 | /************************************************************************************************/ 64 | 65 | /** CONWEAVE: PARAMETERS */ 66 | const timestamp_t CONWEAVE_MAX_TIMESTAMP = 2147483647; // 2**31 - 1 67 | 68 | const timestamp_t CONWEAVE_TX_EXPIRED_TS = 10000000; // for lossless RDMA, timegap to resume new epoch, "inf" for test-purpose 69 | const timestamp_t CONWEAVE_TX_ECN_PORT_TS = 32; // time to drain Kmin-bytes queue (us), e.g., 100KB 100G -> 8us, 100KB 25G -> 32us 70 | const timestamp_t CONWEAVE_TX_REPLY_TIMEOUT_EXTENSION_TS = 4; // 4us, when resubmit a reply pkt, we extend the reply timer to avoid reply_timeout during the resubmit 71 | const timestamp_t CONWEAVE_TX_STOP_REROUTING_TS = 2147473647; // 2**31 - 10ms, just stop re-routing during 10ms to avoid timestamp wrap-around at RxToR 72 | 73 | const timestamp_t CONWEAVE_RX_DEFAULT_WAITING_TIME = 10000; // for lossless RDMA, 10ms for sanity for test-purpose 74 | const timestamp_t CONWEAVE_RX_BASE_WAITING_TIME = 1000; // 32 (us) for 25G, extra waiting time for uncertainty 75 | const timestamp_t CONWEAVE_RX_ADJUST_TS_TAIL_WRAP = 65536; // 65536, when TAIL arrives 76 | const timestamp_t CONWEAVE_RX_ADJUST_TS_TAIL_WRAP_WITH_BASE = 66536; // 66536 = 65536 + 
CONWEAVE_RX_BASE_WAITING_TIME 77 | 78 | /** CONWEAVE: ADVANCED FLOW CONTROL */ 79 | #define AFC_CREDIT_PAUSE (1) 80 | #define AFC_CREDIT_RESUME (0) 81 | 82 | /* for resubmission */ 83 | const bit<3> RESUB_DPRSR_DIGEST_REPLY = 7; 84 | 85 | /* for mirroring */ 86 | const bit<8> MIRROR_SESSION_CONWEAVE = 220; // + pipe_id (0,1,2,3) 87 | 88 | /* for custom hashing (crc32_mpeg) */ 89 | CRCPolynomial<bit<32>>(32w0x04C11DB7, false, false, false, 32w0xFFFFFFFF, 32w0x00000000) CRC32_MPEG; 90 | 91 | /************************************************************************* 92 | ************* C O N S T A N T S A N D T Y P E S ******************* 93 | *************************************************************************/ 94 | 95 | /* ARP */ 96 | #define MCAST_GRP_ID (1) 97 | 98 | /* Mirror Types & Recirculation */ 99 | #if __TARGET_TOFINO__ == 2 100 | #define RECIRC_PORT (6) // recirc port on Tofino2 101 | const bit<4> EG_MIRROR_TYPE_1 = 1; // corresponds to eg_mirror1_h 102 | const bit<4> IG_MIRROR_TYPE_1 = 2; // corresponds to ig_mirror1_h 103 | #else 104 | #define RECIRC_PORT (68) // recirc port on Tofino1 105 | const bit<3> EG_MIRROR_TYPE_1 = 1; // corresponds to eg_mirror1_h 106 | const bit<3> IG_MIRROR_TYPE_1 = 2; // corresponds to ig_mirror1_h 107 | #endif 108 | 109 | /* Hashing and Registers */ 110 | struct pair { // for 32-bit pair 111 | bit<32> lo; 112 | bit<32> hi; 113 | } 114 | 115 | /* for ECMP LAG */ 116 | #define MAX_GROUP_SIZE (32) 117 | #define MAX_GROUPS (256) 118 | #define MAX_PROFILE_MEMBERS (2048) 119 | #define TABLE_IPV4_SIZE (2048) 120 | #define TABLE_NEXTHOP_SIZE (2048) 121 | #define SCRAMBLE_ENABLE (1) 122 | #define HASH_WIDTH (16) 123 | 124 | #endif -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/parser.p4: -------------------------------------------------------------------------------- 1 | #ifndef _PARSER_ 2 | #define _PARSER_ 3 | 4 | #include "macro.p4" 5 | 6 | enum bit<16> ether_type_t { 7 
| IPV4 = 0x0800, 8 | ARP = 0x0806, 9 | CWCTRL = 0x2001, // conweave's ctrl-loop header 10 | CWTAIL = 0x2002 // conweave's TAIL header 11 | } 12 | 13 | enum bit<8> ipv4_proto_t { 14 | TCP = 6, 15 | UDP = 17, 16 | ICMP = 1 17 | } 18 | 19 | enum bit<16> udp_proto_t{ 20 | ROCE_V2 = 4791, 21 | FAKE_ROCE_V2 = 4792 // XXX 22 | } 23 | 24 | // --------------------------------------------------------------------------- 25 | // Ingress parser 26 | // --------------------------------------------------------------------------- 27 | parser SwitchIngressParser( 28 | packet_in pkt, 29 | out header_t hdr, 30 | out metadata_t meta, 31 | out ingress_intrinsic_metadata_t ig_intr_md, 32 | out ingress_intrinsic_metadata_for_tm_t ig_intr_md_for_tm, 33 | out ingress_intrinsic_metadata_from_parser_t ig_intr_md_from_prsr) { 34 | state start { 35 | pkt.extract(ig_intr_md); 36 | /**************************************************************************** 37 | * M E T A D A T A I N I T I A L I Z A T I O N 38 | ****************************************************************************/ 39 | 40 | meta.pipeline_index = ig_intr_md.ingress_port [8:7]; // index of pipeline 41 | meta.mirror_session = 0; 42 | meta.conweave_on_off = 0; 43 | meta.conweave_logic = 0; 44 | meta.switch_id = 0; 45 | meta.nexthop_id = 0; 46 | meta.out_port = 0; 47 | meta.out_queue_id = 0; 48 | meta.last_hop = 0; 49 | 50 | meta.dummy_32b = 0; 51 | meta.ts_now = 0; 52 | meta.ts_tail = 0; 53 | meta.hashidx = 0; 54 | meta.digest_on = 0; 55 | 56 | /*----- C O N W E A V E - TxToR M E T A D A T A -----*/ 57 | meta.ts_base_rtt = 0; 58 | meta.ts_new_reply_timeout = 0; 59 | 60 | meta.sample_port_c1 = 0; 61 | meta.sample_port_c2 = 0; 62 | meta.good_port = 0; 63 | meta.final_port = 0; 64 | meta.no_good_port = 0; 65 | meta.stage_to_record_port = 0; 66 | 67 | meta.flag_rdma_data = 0; 68 | meta.flag_matched = 0; 69 | meta.flag_enforce_no_reroute = 0; 70 | meta.result_expired = 0; 71 | meta.result_stability = 0; 72 | 
meta.result_reply_timeout = 0; 73 | meta.result_timely_replied = 0; 74 | meta.result_phase = 0; 75 | meta.result_epoch = 0; 76 | 77 | meta.result_port_c1_bad = 0; 78 | meta.result_port_c2_bad = 0; 79 | meta.result_reply_with_notify = 0; 80 | 81 | /*----- C O N W E A V E - RxToR (DstToR) M E T A D A T A -----*/ 82 | meta.hash_flowkey = 0; 83 | 84 | meta.ts_phase0_tx = 0; 85 | meta.ts_phase0_rx = 0; 86 | meta.ts_timegap_rx = 0; 87 | meta.ts_expected_tail_arrival_rx = 0; 88 | 89 | meta.result_epoch_rx = 0; 90 | meta.result_phase0_cch_rx = 0; 91 | meta.result_tail_cch_rx = 0; 92 | meta.result_out_of_order_rx = 0; 93 | meta.result_reorder_status = 0; 94 | 95 | meta.hash_qid_sample_c1 = 0; 96 | meta.hash_qid_sample_c2 = 0; 97 | meta.hash_qid_sample_c3 = 0; 98 | meta.idx_q_occup_arr_rx_c1 = 0; 99 | meta.idx_q_occup_arr_rx_c2 = 0; 100 | meta.idx_q_occup_arr_rx_c3 = 0; 101 | meta.result_q_occupancy_c1 = 0; 102 | meta.result_q_occupancy_c2 = 0; 103 | meta.result_q_occupancy_c3 = 0; 104 | meta.result_time_flush_queue_rx = 0; 105 | meta.possibly_tail_before_timeout = 0; 106 | meta.flag_mirr_for_ctrl_loop = 0; 107 | meta.result_tail_send_reply_rx = 0; 108 | meta.result_q_pkt_cntr_ig = 0; 109 | meta.flag_finish_reorder_process = 0; 110 | meta.flag_resume_reorder_queue = 0; 111 | meta.idx_qdepth_history_rx = 0; 112 | 113 | /**** TEMPORARILY *****/ 114 | meta.cntr_additive = 0; 115 | 116 | 117 | /*----- A D V A N C E D F L O W C O N T R O L -----*/ 118 | meta.afc_msg_c1 = 0; 119 | meta.afc_msg_c2 = 0; 120 | meta.afc_msg_c3 = 0; 121 | 122 | /*------ D C Q C N -----*/ 123 | meta.mark_ecn_codepoint = 0; 124 | meta.is_roce_v2 = 0; 125 | meta.dcqcn_prob_output = 0; 126 | meta.dcqcn_random_number = 0; 127 | 128 | /*------ M I R R O R I N G ------*/ 129 | meta.ig_mirror1.mirror_option = 0; 130 | 131 | /*---- R E A D H E A D E R ----*/ 132 | meta.pkt_epoch = 0; 133 | meta.pkt_phase = 0; 134 | meta.pkt_ask_reply = 0; 135 | meta.pkt_tail_flag = 0; 136 | 137 | meta.flag_cwctrl_active = 0; 
138 | meta.pkt_cwctrl_timeout = 0; 139 | meta.pkt_cwctrl_cntr_eg = 0; 140 | meta.pkt_cwctrl_drop = 0; 141 | meta.pkt_cwctrl_afc_msg = 0; 142 | 143 | meta.init_cntr_ig.lo = 0; 144 | meta.init_cntr_ig.hi = CONWEAVE_MAX_TIMESTAMP; 145 | 146 | /*----- D E B U G -----*/ 147 | meta.flag_something_wrong = 0; 148 | 149 | transition select(ig_intr_md.resubmit_flag) { 150 | (0) : init_metadata; 151 | (1) : parse_resubmit; 152 | } 153 | } 154 | 155 | state parse_resubmit { 156 | pkt.extract(meta.resubmit_hdr); 157 | pkt.advance(PORT_METADATA_SIZE - sizeInBits(meta.resubmit_hdr)); 158 | transition parse_ethernet; 159 | } 160 | 161 | state init_metadata { 162 | pkt.advance(PORT_METADATA_SIZE); // macro defined in tofino.p4 163 | transition parse_ethernet; 164 | } 165 | 166 | state parse_ethernet { 167 | pkt.extract(hdr.ethernet); 168 | transition select(hdr.ethernet.ether_type) { 169 | (bit<16>)ether_type_t.IPV4 : parse_ipv4; 170 | (bit<16>)ether_type_t.ARP : parse_arp; 171 | (bit<16>)ether_type_t.CWCTRL : parse_ipv4; 172 | default: accept; 173 | } 174 | } 175 | 176 | state parse_ipv4 { 177 | pkt.extract(hdr.ipv4); 178 | 179 | /* copy src/dst ip address */ 180 | meta.meta_src_addr = hdr.ipv4.src_addr; 181 | meta.meta_dst_addr = hdr.ipv4.dst_addr; 182 | 183 | transition select(hdr.ipv4.protocol) { 184 | (bit<8>)ipv4_proto_t.TCP : parse_tcp; 185 | (bit<8>)ipv4_proto_t.UDP : parse_udp; 186 | (bit<8>)ipv4_proto_t.ICMP : parse_icmp; 187 | default: accept; 188 | } 189 | } 190 | 191 | state parse_arp { 192 | pkt.extract(hdr.arp); 193 | transition accept; 194 | } 195 | 196 | state parse_tcp { 197 | pkt.extract(hdr.tcp); 198 | transition accept; 199 | } 200 | 201 | state parse_udp { 202 | pkt.extract(hdr.udp); 203 | transition select(hdr.udp.dst_port) { 204 | (bit<16>)udp_proto_t.ROCE_V2 : parse_bth; 205 | (bit<16>)udp_proto_t.FAKE_ROCE_V2 : parse_bth; // XXX 206 | default: accept; 207 | } 208 | } 209 | 210 | state parse_bth { 211 | pkt.extract(hdr.bth); 212 | meta.is_roce_v2 = 1; // 
RDMA packet 213 | transition select(hdr.bth.conweave_opcode) { 214 | (bit<2>)1 : parse_conweave; 215 | (bit<2>)2 : parse_conweave; 216 | (bit<2>)3 : parse_conweave; 217 | default: accept; 218 | } 219 | } 220 | 221 | state parse_conweave { 222 | /* pkt metadata */ 223 | meta.pkt_epoch = hdr.bth.conweave_epoch; /* get pkt's epoch */ 224 | meta.pkt_phase = hdr.bth.conweave_phase; /* get pkt's phase */ 225 | meta.pkt_ask_reply = hdr.bth.conweave_ask_reply; /* get pkt's ask_reply */ 226 | meta.pkt_tail_flag = hdr.bth.conweave_tail_flag; /* get tail flag */ 227 | 228 | pkt.extract(hdr.cwh); 229 | transition select(hdr.ethernet.ether_type) { 230 | (bit<16>)ether_type_t.CWCTRL : parse_cwctrl; 231 | default: accept; 232 | } 233 | } 234 | 235 | 236 | state parse_cwctrl { 237 | pkt.extract(hdr.cwctrl); 238 | meta.flag_cwctrl_active = 1; 239 | meta.pkt_cwctrl_timeout = hdr.cwctrl.timeout; 240 | meta.pkt_cwctrl_drop = hdr.cwctrl.drop; 241 | meta.pkt_cwctrl_cntr_eg = hdr.cwctrl.cntr_eg; 242 | meta.pkt_cwctrl_afc_msg = hdr.cwctrl.afc_msg; 243 | transition accept; 244 | } 245 | 246 | 247 | 248 | state parse_icmp { 249 | pkt.extract(hdr.icmp); 250 | transition accept; 251 | } 252 | } 253 | 254 | // --------------------------------------------------------------------------- 255 | // Ingress Deparser 256 | // --------------------------------------------------------------------------- 257 | 258 | control SwitchIngressDeparser( 259 | packet_out pkt, 260 | inout header_t hdr, 261 | in metadata_t meta, 262 | in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) { 263 | Checksum() ipv4_checksum; 264 | Mirror() mirror; 265 | Resubmit() resubmit; 266 | 267 | apply { 268 | /* CHECKSUM */ 269 | hdr.ipv4.hdr_checksum = ipv4_checksum.update({hdr.ipv4.version, 270 | hdr.ipv4.ihl, 271 | hdr.ipv4.dscp, 272 | hdr.ipv4.ecn, 273 | hdr.ipv4.total_len, 274 | hdr.ipv4.identification, 275 | hdr.ipv4.flags, 276 | hdr.ipv4.frag_offset, 277 | hdr.ipv4.ttl, 278 | hdr.ipv4.protocol, 279 | 
hdr.ipv4.src_addr, 280 | hdr.ipv4.dst_addr}); 281 | 282 | /* RESUBMIT */ 283 | if (ig_dprsr_md.resubmit_type == RESUB_DPRSR_DIGEST_REPLY) { 284 | resubmit.emit(meta.resubmit_hdr); 285 | } 286 | 287 | /* INGRESS MIRRORING FOR REPLY/NOTIFY */ 288 | if (ig_dprsr_md.mirror_type == IG_MIRROR_TYPE_1) { 289 | mirror.emit(meta.mirror_session, {meta.ig_mirror1.mirror_option}); 290 | } 291 | 292 | pkt.emit(hdr); 293 | } 294 | } 295 | 296 | // --------------------------------------------------------------------------- 297 | // Egress parser 298 | // --------------------------------------------------------------------------- 299 | parser SwitchEgressParser( 300 | packet_in pkt, 301 | out header_t hdr, 302 | out metadata_t meta, 303 | out egress_intrinsic_metadata_t eg_intr_md, 304 | out egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr) { 305 | state start { 306 | pkt.extract(eg_intr_md); 307 | 308 | 309 | /*---- R E A D H E A D E R ----*/ 310 | meta.pkt_epoch = 0; 311 | meta.pkt_phase = 0; 312 | meta.pkt_ask_reply = 0; 313 | meta.pkt_tail_flag = 0; 314 | 315 | meta.flag_cwctrl_active = 0; 316 | meta.pkt_cwctrl_timeout = 0; 317 | meta.pkt_cwctrl_drop = 0; 318 | meta.pkt_cwctrl_cntr_eg = 0; 319 | meta.pkt_cwctrl_afc_msg = 0; 320 | 321 | 322 | transition parse_metadata; 323 | } 324 | 325 | state parse_metadata { 326 | /* D C Q C N */ 327 | meta.mark_ecn_codepoint = 0; 328 | meta.is_roce_v2 = 0; 329 | meta.dcqcn_prob_output = 0; 330 | meta.dcqcn_random_number = 0; 331 | 332 | /*---- M E T A D A T A ----*/ 333 | meta.flag_check_tail_resume = 0; 334 | 335 | ig_mirror1_h mirror_md = pkt.lookahead<ig_mirror1_h>(); 336 | transition select(mirror_md.mirror_option) { 337 | 1 : parse_mirror_reply_notify; 338 | 2 : parse_mirror_reply_notify; 339 | 3 : parse_mirror_reply_notify; 340 | 4 : parse_mirror_reply_notify; 341 | default: parse_ethernet; 342 | } 343 | } 344 | 345 | /* mirroring */ 346 | state parse_mirror_reply_notify { 347 | pkt.extract(meta.ig_mirror1); 348 | transition 
parse_ethernet; 349 | } 350 | 351 | state parse_ethernet { 352 | pkt.extract(hdr.ethernet); 353 | transition select(hdr.ethernet.ether_type) { 354 | (bit<16>)ether_type_t.IPV4 : parse_ipv4; 355 | (bit<16>)ether_type_t.CWCTRL : parse_ipv4; 356 | (bit<16>)ether_type_t.CWTAIL : parse_ipv4; 357 | default: accept; 358 | } 359 | } 360 | 361 | state parse_ipv4 { 362 | pkt.extract(hdr.ipv4); 363 | transition select(hdr.ipv4.protocol) { 364 | // (bit<8>) ipv4_proto_t.TCP: parse_tcp; 365 | (bit<8>)ipv4_proto_t.UDP : parse_udp; 366 | default: accept; 367 | } 368 | } 369 | 370 | state parse_udp { 371 | pkt.extract(hdr.udp); 372 | transition select(hdr.udp.dst_port) { 373 | (bit<16>)udp_proto_t.ROCE_V2 : parse_bth; 374 | (bit<16>)udp_proto_t.FAKE_ROCE_V2 : parse_bth; // XXX 375 | default: accept; 376 | } 377 | } 378 | 379 | state parse_bth { 380 | pkt.extract(hdr.bth); 381 | meta.is_roce_v2 = 1; // RDMA packet 382 | transition select(hdr.bth.conweave_opcode) { 383 | (bit<2>)1 : parse_conweave; 384 | (bit<2>)2 : parse_conweave; 385 | (bit<2>)3 : parse_conweave; 386 | default: accept; 387 | } 388 | } 389 | 390 | state parse_conweave { 391 | /* pkt metadata */ 392 | meta.pkt_epoch = hdr.bth.conweave_epoch; /* get pkt's epoch */ 393 | meta.pkt_phase = hdr.bth.conweave_phase; /* get pkt's phase */ 394 | meta.pkt_ask_reply = hdr.bth.conweave_ask_reply; /* get pkt's ask_reply */ 395 | meta.pkt_tail_flag = hdr.bth.conweave_tail_flag; /* get tail flag */ 396 | 397 | pkt.extract(hdr.cwh); 398 | transition select(hdr.ethernet.ether_type) { 399 | (bit<16>)ether_type_t.CWCTRL : parse_cwctrl; 400 | (bit<16>)ether_type_t.CWTAIL : parse_cwtail; 401 | default: accept; 402 | } 403 | } 404 | 405 | 406 | state parse_cwctrl { 407 | pkt.extract(hdr.cwctrl); 408 | meta.flag_cwctrl_active = 1; 409 | meta.pkt_cwctrl_timeout = hdr.cwctrl.timeout; 410 | meta.pkt_cwctrl_drop = hdr.cwctrl.drop; 411 | meta.pkt_cwctrl_cntr_eg = hdr.cwctrl.cntr_eg; 412 | meta.pkt_cwctrl_afc_msg = hdr.cwctrl.afc_msg; 413 | 
transition accept; 414 | } 415 | 416 | state parse_cwtail { 417 | pkt.extract(hdr.tailh); 418 | transition accept; 419 | } 420 | 421 | 422 | // do more stuff here if needed 423 | } 424 | 425 | // --------------------------------------------------------------------------- 426 | // Egress Deparser 427 | // --------------------------------------------------------------------------- 428 | control SwitchEgressDeparser( 429 | packet_out pkt, 430 | inout header_t hdr, 431 | in metadata_t meta, 432 | in egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr, 433 | in egress_intrinsic_metadata_t eg_intr_md, 434 | in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr) { 435 | apply { 436 | // do more stuff here if needed 437 | pkt.emit(hdr); 438 | } 439 | } 440 | 441 | #endif -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/registers_egress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | 14 | 15 | 16 | 17 | /**************************************************************************** 18 | * D E Q U E U E C O U N T E R A T E G R E S S 19 | ****************************************************************************/ 20 | Register<bit<32>, conweave_qdepth_idx_width_t>(size=CONWEAVE_QDEPTH_IDX_SIZE) reg_buffer_egress_cntr; 21 | RegisterAction<bit<32>, conweave_qdepth_idx_width_t, bit<32>>(reg_buffer_egress_cntr) reg_read_reset_buffer_egress_cntr = { 22 | void apply(inout bit<32> reg, out bit<32> result){ 23 | if (hdr.cwctrl.drop == 1) { 24 | reg = 0; /** DROP: reorder is resolved, reset counter to 0 */ 25 | } 26 | result = reg; /** READ: read register and save to cwctrl header */ 27 | } 28 | }; 29 | RegisterAction<bit<32>, conweave_qdepth_idx_width_t, bit<32>>(reg_buffer_egress_cntr) 
reg_increment_buffer_egress_cntr = { 30 | void apply(inout bit<32> reg){ 31 | reg = reg |+| 1; /** DEQUEUE: increase counter by 1 */ 32 | } 33 | }; 34 | action do_read_reset_buffer_egress_cntr() { 35 | hdr.cwctrl.cntr_eg = reg_read_reset_buffer_egress_cntr.execute(meta.idx_qdepth_history_rx); 36 | } 37 | action do_increment_buffer_egress_cntr() { 38 | reg_increment_buffer_egress_cntr.execute(meta.idx_qdepth_history_rx); 39 | } 40 | 41 | 42 | 43 | 44 | /**************************************************************************** 45 | * R E O R D E R Q U E U E F L U S H B Y T A I L 46 | ****************************************************************************/ 47 | Register<bit<8>, hashidx_t>(size=CONWEAVE_TABLE_SIZE) reg_tail_resume; 48 | RegisterAction<bit<8>, hashidx_t, bit<1>>(reg_tail_resume) reg_check_tail_resume = { 49 | void apply(inout bit<8> reg, out bit<1> result){ 50 | result = (bit<1>)reg; 51 | if (reg == 1) { 52 | reg = 0; 53 | } 54 | } 55 | }; 56 | RegisterAction<bit<8>, hashidx_t, bit<1>>(reg_tail_resume) reg_update_tail_resume = { 57 | void apply(inout bit<8> reg, out bit<1> result){ 58 | reg = 1; 59 | } 60 | }; 61 | action do_check_tail_resume() { 62 | meta.flag_check_tail_resume = reg_check_tail_resume.execute((hashidx_t)hdr.cwctrl.hashidx); 63 | } 64 | action do_update_tail_resume() { 65 | reg_update_tail_resume.execute((hashidx_t)hdr.tailh.hashidx); 66 | } 67 | 68 | 69 | 70 | /**************************************************************************** 71 | * E C N M A R K I N G 72 | ****************************************************************************/ 73 | 74 | // ##### DCTCP ECN Marking ##### 75 | Register<bit<32>,bit<1>>(1,524287) reg_ecn_marking_threshold; // default = 2^19 - 1 76 | RegisterAction<bit<32>,bit<1>,bit<1>>(reg_ecn_marking_threshold) cmp_ecn_marking_threshold = { 77 | void apply(inout bit<32> reg_val, out bit<1> rv){ 78 | if((bit<32>)eg_intr_md.deq_qdepth >= reg_val){ 79 | rv = 1; 80 | } 81 | else{ 82 | rv = 0; 83 | } 84 | } 85 | }; 86 | action 
check_ecn_marking_threshold(){ 87 | meta.mark_ecn_codepoint = cmp_ecn_marking_threshold.execute(0); 88 | } 89 | 90 | 91 | 92 | /**************************************************************************** 93 | * D E B U G G I N G 94 | ****************************************************************************/ 95 | Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr1; 96 | RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr1) reg_debug_eg_cntr1_action = { 97 | void apply(inout bit<32> reg, out bit<32> result) { 98 | reg = reg + 1; 99 | } 100 | }; 101 | action do_debug_eg_cntr1() { 102 | reg_debug_eg_cntr1_action.execute(0); 103 | } 104 | Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr2; 105 | RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr2) reg_debug_eg_cntr2_action = { 106 | void apply(inout bit<32> reg, out bit<32> result) { 107 | reg = reg + 1; 108 | } 109 | }; 110 | action do_debug_eg_cntr2() { 111 | reg_debug_eg_cntr2_action.execute(0); 112 | } 113 | Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr3; 114 | RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr3) reg_debug_eg_cntr3_action = { 115 | void apply(inout bit<32> reg, out bit<32> result) { 116 | reg = reg + 1; 117 | } 118 | }; 119 | action do_debug_eg_cntr3() { 120 | reg_debug_eg_cntr3_action.execute(0); 121 | } 122 | Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr4; 123 | RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr4) reg_debug_eg_cntr4_action = { 124 | void apply(inout bit<32> reg, out bit<32> result) { 125 | reg = reg + 1; 126 | } 127 | }; 128 | action do_debug_eg_cntr4() { 129 | reg_debug_eg_cntr4_action.execute(0); 130 | } 131 | Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr5; 132 | RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr5) reg_debug_eg_cntr5_action = { 133 | void apply(inout bit<32> reg, out bit<32> result) { 134 | reg = reg + 1; 135 | } 136 | }; 137 | action do_debug_eg_cntr5() { 138 | reg_debug_eg_cntr5_action.execute(0); 139 | } 140 | --------------------------------------------------------------------------------
/leaf_conweave/p4src/includes/tables_egress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | 14 | /**************************************************************************** 15 | * M A P R E G - I N D E X F O R Q D E P T H H I S T O R Y 16 | ****************************************************************************/ 17 | action get_idx_queue_occupancy_array_data_eg(conweave_qdepth_idx_width_t idx) { 18 | meta.idx_qdepth_history_rx = idx; 19 | meta.hit_idx_queue_occupancy_tbl_eg = 1; 20 | } 21 | 22 | table do_get_idx_queue_occupancy_array_data_eg { 23 | key = { 24 | eg_intr_md.egress_port: exact; // 9 bits 25 | eg_intr_md.egress_qid: exact; // 7 bits 26 | } 27 | actions = { get_idx_queue_occupancy_array_data_eg; @defaultonly nop; } 28 | const default_action = nop(); 29 | size = CONWEAVE_QDEPTH_IDX_SIZE; 30 | } 31 | 32 | action get_idx_queue_occupancy_array_ctrl_eg(conweave_qdepth_idx_width_t idx) { 33 | meta.idx_qdepth_history_rx = idx; 34 | meta.hit_idx_queue_occupancy_tbl_eg = 1; 35 | } 36 | 37 | table do_get_idx_queue_occupancy_array_ctrl_eg { 38 | key = { 39 | hdr.cwctrl.afc_msg: exact; // 32 bits 40 | } 41 | actions = { get_idx_queue_occupancy_array_ctrl_eg; @defaultonly nop; } 42 | const default_action = nop(); 43 | size = CONWEAVE_QDEPTH_IDX_SIZE; 44 | } 45 | 46 | /**************************************************************************** 47 | * D C Q C N C O N F I G U R A T I O N 48 | ****************************************************************************/ 49 | 50 | 51 | // ##### DCQCN ECN Marking ##### 52 | action dcqcn_mark_probability(bit<8> value) { 53 | meta.dcqcn_prob_output = value; 54 | } 55 | 56 | table dcqcn_get_ecn_probability { 57 | key = { 58 | eg_intr_md.deq_qdepth : range; // 19 bits 59 | } 60 | 
actions = { 61 | dcqcn_mark_probability; 62 | } 63 | const default_action = dcqcn_mark_probability(0); // default: no ecn mark 64 | size = 1024; 65 | } 66 | 67 | Random<bit<8>>() random; // random seed for sampling 68 | action dcqcn_get_random_number(){ 69 | meta.dcqcn_random_number = random.get(); 70 | } 71 | 72 | action dcqcn_check_ecn_marking() { 73 | meta.mark_ecn_codepoint = 1; 74 | } 75 | 76 | table dcqcn_compare_probability { 77 | key = { 78 | meta.dcqcn_prob_output : exact; 79 | meta.dcqcn_random_number : exact; 80 | } 81 | actions = { 82 | dcqcn_check_ecn_marking; 83 | @defaultonly nop; 84 | } 85 | const default_action = nop(); 86 | size = 65536; 87 | } 88 | // ##### DCQCN ECN Marking (end) ##### 89 | 90 | 91 | 92 | /**************************************************************************** 93 | * P O R T S F R O M T O R T O D E S T I N A T I O N 94 | ****************************************************************************/ 95 | 96 | action check_toward_dst() { 97 | meta.last_hop = 1; 98 | } 99 | table do_check_toward_dst { 100 | key = { 101 | eg_intr_md.egress_port : exact; 102 | } 103 | actions = { 104 | check_toward_dst; @defaultonly nop; 105 | } 106 | const default_action = nop(); 107 | size = 256; 108 | } 109 | 110 | -------------------------------------------------------------------------------- /leaf_conweave/p4src/includes/tables_ingress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | 14 | /**************************************************************************** 15 | * C O M M O N F U N C T I O N S - E C M P , L A S T H O P, R D M A 16 | ****************************************************************************/ 17 | 18 | /* Check RDMA data packets (to dynamically reroute) */ 19 | action set_rdma_data() { 
20 | meta.flag_rdma_data = 1; 21 | } 22 | table check_rdma_data { 23 | key = { 24 | hdr.udp.dst_port : exact; 25 | hdr.bth.opcode : exact; 26 | } 27 | actions = { 28 | set_rdma_data; @defaultonly nop; 29 | } 30 | const default_action = nop(); 31 | size = 64; 32 | } 33 | 34 | /* manually get switch_id from ingress port (PORT_METADATA has issue in bf-sde-9.11.0) */ 35 | action write_switch_id(switch_id_t switch_id) { 36 | meta.switch_id = switch_id; 37 | } 38 | table get_switch_id { 39 | key = { 40 | ig_intr_md.ingress_port: exact; 41 | hdr.bth.conweave_opcode: ternary; 42 | meta.meta_src_addr: ternary; // hdr.ipv4.src_addr 43 | meta.meta_dst_addr: ternary; // hdr.ipv4.dst_addr 44 | } 45 | actions = { 46 | write_switch_id; 47 | @defaultonly nop; 48 | } 49 | const default_action = nop(); 50 | size = 1024; 51 | } 52 | 53 | /* Check last_hop */ 54 | action acknowledge_last_hop() { 55 | meta.last_hop = 1; 56 | } 57 | table check_last_hop { /* check last-hop pkt (including intra-ToR traffic) */ 58 | key = { 59 | meta.switch_id: exact; // switchId 60 | meta.meta_dst_addr: exact; // hdr.ipv4.dst_addr 61 | } 62 | actions = { 63 | acknowledge_last_hop; 64 | @defaultonly nop; 65 | } 66 | const default_action = nop(); 67 | size = TABLE_IPV4_SIZE; 68 | } 69 | 70 | // action write_nexthop_id(nexthop_id_t nexthop_id) { meta.nexthop_id = nexthop_id; /* group id */ } // BUGGY 71 | action write_nexthop_id(nexthop_id_t nexthop_id) { 72 | /* NOTE: this is a bad implementation, but we did as our SDE version (9.10.0) had a buggy compiler issue */ 73 | ig_intr_md_for_tm.level1_exclusion_id = (bit<16>)nexthop_id; 74 | meta.nexthop_id = (nexthop_id_t)ig_intr_md_for_tm.level1_exclusion_id; 75 | } 76 | table get_nexthop_id { 77 | key = { 78 | meta.switch_id: exact; 79 | hdr.ipv4.dst_addr: exact; 80 | } 81 | actions = { write_nexthop_id; @defaultonly nop; } 82 | const default_action = nop(); 83 | size = TABLE_IPV4_SIZE; 84 | } 85 | 86 | Hash<bit<HASH_WIDTH>> (HashAlgorithm_t.CRC16) lag_ecmp_hash; 87 | 
ActionProfile(size = MAX_PROFILE_MEMBERS) lag_ecmp; 88 | ActionSelector( 89 | action_profile = lag_ecmp /* profile */, 90 | hash = lag_ecmp_hash /* hash */, 91 | mode = SelectorMode_t.FAIR /* fair */, 92 | max_group_size = MAX_GROUP_SIZE, 93 | num_groups = MAX_GROUPS) lag_ecmp_sel /* selector */; 94 | 95 | @selector_enable_scramble(SCRAMBLE_ENABLE) /* enable non-linear hash */ 96 | table nexthop { 97 | key = { 98 | meta.nexthop_id : exact; 99 | hdr.ipv4.src_addr : selector; 100 | hdr.ipv4.dst_addr : selector; 101 | hdr.udp.src_port : selector; 102 | } 103 | actions = { set_port; drop; } 104 | const default_action = drop(0x1); 105 | size = TABLE_NEXTHOP_SIZE; 106 | implementation = lag_ecmp_sel; 107 | } 108 | 109 | 110 | 111 | /**************************************************************************** 112 | * C O M M O N C O N W E A V E F U N C T I O N 113 | ****************************************************************************/ 114 | 115 | /* ConWeave Logical Cateogory -> TxToR / RxToR processing */ 116 | action categorize_conweave_logical_step(bit<2> val) { 117 | meta.conweave_logic = val; 118 | } 119 | table do_categorize_conweave_logical_step { 120 | key = { 121 | meta.last_hop : exact; // 1b 122 | hdr.bth.conweave_opcode : exact; // 2b 123 | } 124 | actions = { categorize_conweave_logical_step; nop; } 125 | const entries = { 126 | (0, 0) : categorize_conweave_logical_step(1); // TxToR - Tx 127 | (1, 2) : categorize_conweave_logical_step(1); // TxToR - Received REPLY 128 | (1, 3) : categorize_conweave_logical_step(1); // TxToR - Received NOTIFY 129 | (1, 1) : categorize_conweave_logical_step(2); // RxToR - Rx 130 | (0, 2) : categorize_conweave_logical_step(2); // RxToR - Sending REPLY 131 | (0, 3) : categorize_conweave_logical_step(2); // RxToR - Sending NOTIFY 132 | (0, 1) : categorize_conweave_logical_step(3); // WRONG CONFIG!! 
not last hop but has cwh header 133 | (1, 0) : categorize_conweave_logical_step(0); // Intra-ToR traffic - bypass 134 | } 135 | size = 8; 136 | } 137 | 138 | /* persistent connection -> hash_index (register idx), and base_rtt (for reply-deadline) */ 139 | action write_hashidx_basertt(hashidx_t idx, timestamp_t base_rtt) { 140 | meta.hashidx = idx; 141 | meta.ts_base_rtt = base_rtt; 142 | meta.flag_matched = 1; 143 | } 144 | table get_hashidx_basertt { 145 | key = { 146 | hdr.ipv4.src_addr : exact; 147 | hdr.ipv4.dst_addr : exact; 148 | hdr.udp.src_port : exact; 149 | } 150 | actions = {write_hashidx_basertt; @defaultonly nop; } 151 | const default_action = nop(); 152 | size = CONWEAVE_TABLE_SIZE; 153 | } 154 | 155 | 156 | 157 | /**************************************************************************** 158 | * C O N W E A V E - T X T O R 159 | ****************************************************************************/ 160 | 161 | /* TWO OUTPORT SAMPLING */ 162 | action set_port_c1(bit<8> port) { meta.sample_port_c1 = port; } 163 | Hash<bit<HASH_WIDTH>> (HashAlgorithm_t.CRC16) lag_ecmp_hash_c1; 164 | ActionProfile(size = MAX_PROFILE_MEMBERS) lag_ecmp_c1; 165 | ActionSelector( 166 | action_profile = lag_ecmp_c1 /* profile */, 167 | hash = lag_ecmp_hash_c1 /* hash */, 168 | mode = SelectorMode_t.FAIR /* fair */, 169 | max_group_size = MAX_GROUP_SIZE, 170 | num_groups = MAX_GROUPS) lag_ecmp_sel_c1; 171 | 172 | @selector_enable_scramble(SCRAMBLE_ENABLE) /* enable non-linear hash */ 173 | table nexthop_c1 { 174 | key = { 175 | meta.nexthop_id : exact; 176 | meta.ts_now : selector; 177 | } 178 | actions = { set_port_c1; drop; } 179 | const default_action = drop(0x1); 180 | size = TABLE_NEXTHOP_SIZE; 181 | implementation = lag_ecmp_sel_c1; 182 | } 183 | 184 | action set_port_c2(bit<8> port) { meta.sample_port_c2 = port; } 185 | Hash<bit<HASH_WIDTH>> (HashAlgorithm_t.RANDOM) lag_ecmp_hash_c2; 186 | ActionProfile(size = MAX_PROFILE_MEMBERS) lag_ecmp_c2; 187 | ActionSelector( 188 | action_profile = 
lag_ecmp_c2 /* profile */, 189 | hash = lag_ecmp_hash_c2 /* hash */, 190 | mode = SelectorMode_t.FAIR /* fair */, 191 | max_group_size = MAX_GROUP_SIZE, 192 | num_groups = MAX_GROUPS) lag_ecmp_sel_c2; 193 | 194 | @selector_enable_scramble(SCRAMBLE_ENABLE) /* enable non-linear hash */ 195 | table nexthop_c2 { 196 | key = { 197 | meta.nexthop_id : exact; 198 | meta.ts_now : selector; 199 | } 200 | actions = { set_port_c2; @defaultonly drop; } 201 | const default_action = drop(0x1); 202 | size = TABLE_NEXTHOP_SIZE; 203 | implementation = lag_ecmp_sel_c2; 204 | } 205 | 206 | 207 | 208 | table do_check_and_update_port { 209 | key = { 210 | meta.result_expired: ternary; 211 | meta.result_reply_timeout: ternary; 212 | } 213 | actions = { 214 | do_update_port_if_expired; 215 | do_update_port_if_reply_timeout; 216 | do_get_current_port; 217 | } 218 | const entries = { 219 | (1, _) : do_update_port_if_expired(); /* change port from now (i.e., this packet) */ 220 | (_, 1) : do_update_port_if_reply_timeout(); /* change port from next packet */ 221 | (0, 0) : do_get_current_port(); /* e.g., meta.result_stability = 1 */ 222 | } 223 | size = 3; 224 | } 225 | 226 | 227 | 228 | table do_check_and_update_tail_ts { 229 | key = { 230 | meta.result_expired: ternary; 231 | meta.result_reply_timeout: ternary; 232 | meta.result_stability: ternary; 233 | } 234 | actions = { 235 | do_get_tail_ts; 236 | do_set_tail_ts_now; 237 | do_set_tail_ts_zero; 238 | } 239 | const entries = { 240 | (1, _, _) : do_set_tail_ts_zero(); // meta.ts_tail = 0 241 | (_, 1, _) : do_set_tail_ts_now(); // meta.ts_tail = meta.ts_now 242 | (_, _, 1) : do_set_tail_ts_zero(); // meta.ts_tail = 0 243 | (0, 0, 0) : do_get_tail_ts(); 244 | } 245 | size = 4; 246 | } 247 | 248 | /* update header and decode 8bits -> 9bits port */ 249 | action update_header_and_decode_port(PortId_t decoded_port) { 250 | hdr.bth.out_port = meta.final_port; /* conweave's encoded port */ 251 | meta.out_port = decoded_port; /* decoded port to 
forward pkt at switch (PortId_t = bit<9>) */ 252 | } 253 | table do_update_conweave_header_out_port { /* decoding 8-bits portId to 9-bits */ 254 | key = { 255 | meta.final_port: exact; 256 | } 257 | actions = { update_header_and_decode_port; @defaultonly nop; } 258 | const default_action = nop(); 259 | size = 256; 260 | } 261 | 262 | 263 | /* ask reply: set hdr.bth.conweave_ask_reply when any of expired / reply_timeout / stability is 1 */ 264 | action update_conweave_header_ask_reply() { 265 | hdr.bth.conweave_ask_reply = 1; 266 | } 267 | table do_update_conweave_header_ask_reply { 268 | key = { 269 | meta.result_expired: ternary; // new start 270 | meta.result_reply_timeout: ternary; // TAIL 271 | meta.result_stability: ternary; // new start 272 | } 273 | actions = { update_conweave_header_ask_reply; @defaultonly nop; } 274 | const entries = { 275 | (1, _, _) : update_conweave_header_ask_reply(); // INIT 276 | (_, 1, _) : update_conweave_header_ask_reply(); // TAIL 277 | (_, _, 1) : update_conweave_header_ask_reply(); // INIT 278 | } 279 | const default_action = nop(); 280 | size = 4; 281 | } 282 | 283 | 284 | 285 | 286 | /**************************************************************************** 287 | * C O N W E A V E - R X T O R 288 | ****************************************************************************/ 289 | /* check epoch - output: prev(2), curr(0), next(1) -> meta.result_epoch_rx */ 290 | table do_check_epoch_rx { 291 | key = { 292 | hdr.bth.conweave_epoch: exact; 293 | } 294 | actions = { 295 | do_check_epoch_pkt_0_rx; 296 | do_check_epoch_pkt_1_rx; 297 | do_check_epoch_pkt_2_rx; 298 | do_check_epoch_pkt_3_rx; 299 | } 300 | const entries = { /* for bits wrap-around issue */ 301 | (0) : do_check_epoch_pkt_0_rx(); 302 | (1) : do_check_epoch_pkt_1_rx(); 303 | (2) : do_check_epoch_pkt_2_rx(); 304 | (3) : do_check_epoch_pkt_3_rx(); 305 | } 306 | size = 4; 307 | } 308 | 309 | /* give default queue_id (physical) for a given dev_port */ 310 | action get_default_queue_id(QueueId_t qid) { 311 | meta.out_queue_id = qid; 312 | } 313 | table 
do_get_default_queue_id { 314 | key = { 315 | meta.out_port: exact; // 9 bits 316 | } 317 | actions = { get_default_queue_id; @defaultonly nop; } 318 | const default_action = nop(); 319 | size = 512; 320 | } 321 | 322 | 323 | /* get register index for queue occupancy & afc_msg */ 324 | action get_idx_queue_occupancy_array_c1(conweave_qreg_idx_width_t idx, afc_msg_t afc_msg) { 325 | meta.idx_q_occup_arr_rx_c1 = idx; // 12 bits 326 | meta.afc_msg_c1 = afc_msg; // 32 bits 327 | } 328 | table do_get_idx_queue_occupancy_array_c1 { 329 | key = { 330 | meta.out_port: exact; // 9 bits 331 | meta.hash_qid_sample_c1: exact; // 7 bits (e.g., 2 ~ 5 for 25G, 2 ~ 9 for 100G) 332 | } 333 | actions = { get_idx_queue_occupancy_array_c1; @defaultonly nop; } 334 | const default_action = nop(); 335 | size = CONWEAVE_QREG_IDX_SIZE; 336 | } 337 | /* get register index for queue occupancy & afc_msg */ 338 | action get_idx_queue_occupancy_array_c2(conweave_qreg_idx_width_t idx, afc_msg_t afc_msg) { 339 | meta.idx_q_occup_arr_rx_c2 = idx; // 12 bits 340 | meta.afc_msg_c2 = afc_msg; // 32 bits 341 | } 342 | table do_get_idx_queue_occupancy_array_c2 { 343 | key = { 344 | meta.out_port: exact; // 9 bits 345 | meta.hash_qid_sample_c2: exact; // 7 bits (e.g., 6 ~ 9 for 25G, 10 ~ 17 for 100G) 346 | } 347 | actions = { get_idx_queue_occupancy_array_c2; @defaultonly nop; } 348 | const default_action = nop(); 349 | size = CONWEAVE_QREG_IDX_SIZE; 350 | } 351 | 352 | /* get register index for queue occupancy & afc_msg */ 353 | action get_idx_queue_occupancy_array_c3(conweave_qreg_idx_width_t idx, afc_msg_t afc_msg) { 354 | meta.idx_q_occup_arr_rx_c3 = idx; // 12 bits 355 | meta.afc_msg_c3 = afc_msg; // 32 bits 356 | } 357 | table do_get_idx_queue_occupancy_array_c3 { 358 | key = { 359 | meta.out_port: exact; // 9 bits 360 | meta.hash_qid_sample_c3: exact; // 7 bits (e.g., 10 ~ 13 for 25G, 18 ~ 25 for 100G) 361 | } 362 | actions = { get_idx_queue_occupancy_array_c3; @defaultonly nop; } 363 | const 
default_action = nop(); 364 | size = CONWEAVE_QREG_IDX_SIZE; 365 | } 366 | 367 | 368 | table update_q_occupancy_c1 { 369 | key = { 370 | meta.flag_finish_reorder_process: exact; // 1b 371 | meta.result_reorder_status: exact; // 2b 372 | } 373 | actions = { 374 | do_reset_q_occupancy_c1; 375 | do_register_q_occupancy_c1; 376 | do_check_q_occupancy_c1; 377 | @defaultonly nop; 378 | } 379 | const entries = { 380 | (1, 0) : do_reset_q_occupancy_c1(); 381 | (1, 1) : do_reset_q_occupancy_c1(); 382 | (1, 2) : do_reset_q_occupancy_c1(); 383 | (1, 3) : do_reset_q_occupancy_c1(); 384 | (0, 1) : do_check_q_occupancy_c1(); 385 | (0, 2) : do_register_q_occupancy_c1(); 386 | } 387 | const default_action = nop(); 388 | size = 8; 389 | } 390 | 391 | 392 | table update_q_occupancy_c2 { 393 | key = { 394 | meta.flag_finish_reorder_process: exact; // 1b 395 | meta.result_reorder_status: exact; // 2b 396 | } 397 | actions = { 398 | do_reset_q_occupancy_c2; 399 | do_register_q_occupancy_c2; 400 | do_check_q_occupancy_c2; 401 | @defaultonly nop; 402 | } 403 | const entries = { 404 | (1, 0) : do_reset_q_occupancy_c2(); 405 | (1, 1) : do_reset_q_occupancy_c2(); 406 | (1, 2) : do_reset_q_occupancy_c2(); 407 | (1, 3) : do_reset_q_occupancy_c2(); 408 | (0, 1) : do_check_q_occupancy_c2(); 409 | (0, 2) : do_register_q_occupancy_c2(); 410 | } 411 | const default_action = nop(); 412 | size = 8; 413 | } 414 | 415 | 416 | 417 | table update_q_occupancy_c3 { 418 | key = { 419 | meta.flag_finish_reorder_process: exact; // 1b 420 | meta.result_reorder_status: exact; // 2b 421 | } 422 | actions = { 423 | do_reset_q_occupancy_c3; 424 | do_register_q_occupancy_c3; 425 | do_check_q_occupancy_c3; 426 | @defaultonly nop; 427 | } 428 | const entries = { 429 | (1, 0) : do_reset_q_occupancy_c3(); 430 | (1, 1) : do_reset_q_occupancy_c3(); 431 | (1, 2) : do_reset_q_occupancy_c3(); 432 | (1, 3) : do_reset_q_occupancy_c3(); 433 | (0, 1) : do_check_q_occupancy_c3(); 434 | (0, 2) : do_register_q_occupancy_c3(); 435 
| } 436 | const default_action = nop(); 437 | size = 8; 438 | } 439 | 440 | 441 | 442 | // if (meta.result_tail_send_reply_rx == 1) { /* Send REPLY of TAIL (CLEAR) */ 443 | // ingress_mirroring(1); 444 | // } else if (meta.pkt_ask_reply == 1 && meta.pkt_tail_flag == 0) { /* Send REPLY of INIT */ 445 | // ingress_mirroring(2); 446 | // } else if (meta.flag_mirr_for_ctrl_loop == 1) { /* Craft ConWeave CTRL pkt */ 447 | // ingress_mirroring(4); /* Note: NewOoO is always by phase1 pkt, which does not need to be replied */ 448 | // } else if (hdr.ipv4.ecn == 0x3) { /* Send NOTIFY */ 449 | // ingress_mirroring(3); 450 | // } 451 | table do_ingress_mirroring { 452 | key = { 453 | meta.result_tail_send_reply_rx : exact; // 1b 454 | meta.pkt_ask_reply : ternary; // 1b 455 | meta.pkt_tail_flag : ternary; // 1b 456 | meta.flag_mirr_for_ctrl_loop : ternary; // 1b 457 | hdr.ipv4.ecn: ternary; // 2b 458 | } 459 | actions = { 460 | ingress_mirroring; 461 | @defaultonly nop; 462 | } 463 | const entries = { 464 | (1, _, _, _, _): ingress_mirroring(1); // TAIL 465 | (0, 1, 0, _, _): ingress_mirroring(2); // INIT 466 | (0, _, _, 1, _): ingress_mirroring(4); // CWCTRL 467 | (0, _, _, _, 3): ingress_mirroring(3); // NOTIFY 468 | } 469 | const default_action = nop(); 470 | size = 4; 471 | } 472 | 473 | 474 | -------------------------------------------------------------------------------- /leaf_conweave/p4src/leaf_conweave.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif /* architecture header: t2na.p4 when targeting Tofino2, tna.p4 otherwise */ 8 | 9 | #include "includes/conweave_egress.p4" 10 | #include "includes/conweave_ingress.p4" 11 | #include "includes/headers.p4" 12 | #include "includes/macro.p4" 13 | #include "includes/parser.p4" 14 | 15 | Pipeline(SwitchIngressParser(), 16 | SwitchIngress(), 17 | SwitchIngressDeparser(), 18 | SwitchEgressParser(), 19 | SwitchEgress(), 20 | SwitchEgressDeparser() 21 | ) 
pipe; 22 | 23 | Switch(pipe) main; -------------------------------------------------------------------------------- /leaf_conweave_resource/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | ${SDE}/p4_build.sh ./p4src/leaf_conweave_resource.p4 --with-tofino2 3 | -------------------------------------------------------------------------------- /leaf_conweave_resource/mau.resources.log: -------------------------------------------------------------------------------- 1 | +---------------------------------------------------------------------+ 2 | | Log file: mau.resources.log | 3 | | Compiler version: 9.11.1 | 4 | | Created on: Wed Aug 9 23:16:05 2023 | 5 | | Run ID: 2ad741efbee043dd | 6 | +---------------------------------------------------------------------+ 7 | ... 8 | ------------------------------------------------------------------------------------------------------------------------------------ 9 | | Stage Number | Exact Match Input xbar | Ternary Match Input xbar | Hash Bit | Hash Dist Unit | Gateway | SRAM | Map RAM | TCAM | VLIW Instr | Meter ALU | Stats ALU | Stash | Exact Match Search Bus | Exact Match Result Bus | Tind Result Bus | Action Data Bus Bytes | 8-bit Action Slots | 16-bit Action Slots | 32-bit Action Slots | Logical TableID | 10 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 11 | | 0 | 28.91% | 0.00% | 45.91% | 83.33% | 68.75% | 20.00% | 14.58% | 0.00% | 15.62% | 50.00% | 0.00% | 0.00% | 87.50% | 43.75% | 68.75% | 19.53% | 0.00% | 0.00% | 0.00% | 100.00% | 12 | | 1 | 7.81% | 0.00% | 12.26% | 16.67% | 37.50% | 8.75% | 4.17% | 0.00% | 15.62% | 25.00% | 0.00% | 0.00% | 43.75% | 18.75% | 
68.75% | 9.38% | 0.00% | 0.00% | 0.00% | 81.25% | 13 | | 2 | 10.94% | 0.00% | 11.54% | 0.00% | 37.50% | 81.25% | 0.00% | 0.00% | 6.25% | 0.00% | 0.00% | 0.00% | 37.50% | 6.25% | 18.75% | 14.06% | 0.00% | 0.00% | 0.00% | 43.75% | 14 | | 3 | 8.59% | 0.00% | 10.58% | 33.33% | 68.75% | 28.75% | 45.83% | 0.00% | 15.62% | 50.00% | 0.00% | 0.00% | 68.75% | 12.50% | 31.25% | 1.56% | 0.00% | 0.00% | 0.00% | 75.00% | 15 | | 4 | 17.19% | 0.00% | 42.55% | 33.33% | 43.75% | 63.75% | 85.42% | 0.00% | 15.62% | 100.00% | 0.00% | 0.00% | 37.50% | 37.50% | 12.50% | 14.84% | 0.00% | 0.00% | 0.00% | 56.25% | 16 | | 5 | 23.44% | 0.00% | 31.25% | 33.33% | 68.75% | 48.75% | 60.42% | 0.00% | 28.12% | 100.00% | 0.00% | 0.00% | 75.00% | 75.00% | 50.00% | 10.16% | 0.00% | 0.00% | 0.00% | 87.50% | 17 | | 6 | 11.72% | 0.00% | 21.88% | 50.00% | 68.75% | 40.00% | 56.25% | 0.00% | 18.75% | 75.00% | 0.00% | 0.00% | 56.25% | 56.25% | 43.75% | 17.19% | 0.00% | 0.00% | 0.00% | 75.00% | 18 | | 7 | 18.75% | 0.00% | 20.91% | 50.00% | 81.25% | 32.50% | 54.17% | 0.00% | 18.75% | 100.00% | 0.00% | 0.00% | 62.50% | 62.50% | 50.00% | 12.50% | 0.00% | 0.00% | 0.00% | 93.75% | 19 | | 8 | 32.03% | 3.03% | 24.76% | 66.67% | 75.00% | 33.75% | 54.17% | 4.17% | 28.12% | 100.00% | 0.00% | 0.00% | 68.75% | 50.00% | 62.50% | 10.16% | 0.00% | 0.00% | 0.00% | 87.50% | 20 | | 9 | 23.44% | 0.00% | 18.03% | 33.33% | 62.50% | 8.75% | 12.50% | 0.00% | 21.88% | 75.00% | 0.00% | 0.00% | 56.25% | 31.25% | 62.50% | 7.03% | 0.00% | 0.00% | 0.00% | 93.75% | 21 | | 10 | 20.31% | 0.00% | 21.39% | 50.00% | 56.25% | 15.00% | 22.92% | 0.00% | 18.75% | 100.00% | 0.00% | 0.00% | 43.75% | 50.00% | 43.75% | 10.16% | 0.00% | 0.00% | 0.00% | 75.00% | 22 | | 11 | 8.59% | 0.00% | 4.81% | 16.67% | 31.25% | 3.75% | 4.17% | 0.00% | 12.50% | 25.00% | 0.00% | 0.00% | 31.25% | 6.25% | 18.75% | 3.12% | 0.00% | 0.00% | 0.00% | 37.50% | 23 | | 12 | 15.62% | 0.00% | 3.85% | 16.67% | 93.75% | 21.25% | 35.42% | 0.00% | 12.50% | 25.00% | 0.00% | 0.00% | 
93.75% | 31.25% | 37.50% | 3.12% | 0.00% | 0.00% | 0.00% | 100.00% | 24 | | 13 | 7.03% | 9.09% | 6.49% | 16.67% | 31.25% | 11.25% | 14.58% | 8.33% | 18.75% | 50.00% | 0.00% | 0.00% | 31.25% | 0.00% | 56.25% | 2.34% | 0.00% | 0.00% | 0.00% | 56.25% | 25 | | 14 | 3.91% | 0.00% | 2.40% | 0.00% | 25.00% | 1.25% | 0.00% | 0.00% | 12.50% | 0.00% | 0.00% | 0.00% | 25.00% | 6.25% | 25.00% | 3.12% | 0.00% | 0.00% | 0.00% | 37.50% | 26 | | 15 | 1.56% | 0.00% | 0.00% | 0.00% | 12.50% | 0.00% | 0.00% | 0.00% | 6.25% | 0.00% | 0.00% | 0.00% | 12.50% | 0.00% | 12.50% | 0.00% | 0.00% | 0.00% | 0.00% | 12.50% | 27 | | 16 | 0.78% | 0.00% | 0.00% | 0.00% | 6.25% | 0.00% | 0.00% | 0.00% | 6.25% | 0.00% | 0.00% | 0.00% | 6.25% | 0.00% | 12.50% | 0.00% | 0.00% | 0.00% | 0.00% | 12.50% | 28 | | 17 | 0.78% | 0.00% | 0.00% | 0.00% | 6.25% | 0.00% | 0.00% | 0.00% | 3.12% | 0.00% | 0.00% | 0.00% | 6.25% | 0.00% | 6.25% | 0.00% | 0.00% | 0.00% | 0.00% | 6.25% | 29 | | 18 | 2.34% | 0.00% | 0.00% | 0.00% | 25.00% | 0.00% | 0.00% | 0.00% | 12.50% | 0.00% | 0.00% | 0.00% | 25.00% | 0.00% | 25.00% | 1.56% | 0.00% | 0.00% | 0.00% | 25.00% | 30 | | 19 | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 31 | ------------------------------------------------------------------------------------------------------------------------------------ 32 | ... 
33 | -------------------------------------------------------------------------------- /leaf_conweave_resource/p4src/includes/actions_egress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif /* architecture header: t2na.p4 when targeting Tofino2, tna.p4 otherwise */ 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | 14 | /**************************************************************************** 15 | * B A S I C F U N C T I O N S 16 | ****************************************************************************/ 17 | action nop() {} 18 | 19 | action swap_src_dst_fields() { /* swap src and dst */ 20 | /* swap srcip <-> dstip */ 21 | meta.dummy_32b = hdr.ipv4.dst_addr; 22 | hdr.ipv4.dst_addr = hdr.ipv4.src_addr; 23 | hdr.ipv4.src_addr = meta.dummy_32b; 24 | } 25 | 26 | /* HEADER CLEANING */ 27 | action invalid_conweave_eg() { 28 | hdr.cwh.setInvalid(); /* remove conweave header */ 29 | } 30 | action initialize_bth_header_eg() { 31 | hdr.bth.conweave_opcode = 0; 32 | hdr.bth.conweave_phase = 0; 33 | hdr.bth.conweave_epoch = 0; 34 | hdr.bth.conweave_ask_reply = 0; 35 | hdr.bth.conweave_tail_flag = 0; 36 | hdr.bth.out_port = 0; 37 | } 38 | 39 | // ##### DCTCP ECN Marking ##### 40 | action mark_ecn_ce_codepoint(){ 41 | hdr.ipv4.ecn = 0b11; 42 | } 43 | -------------------------------------------------------------------------------- /leaf_conweave_resource/p4src/includes/actions_ingress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif /* architecture header: t2na.p4 when targeting Tofino2, tna.p4 otherwise */ 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | /**************************************************************************** 14 | * B A S I C F U N C T I O N S 15 | 
****************************************************************************/ 16 | action nop() {} 17 | 18 | action drop(bit<3> drop_bits) { 19 | ig_intr_md_for_dprsr.drop_ctl = drop_bits; 20 | } 21 | 22 | action set_port(PortId_t port) { 23 | meta.out_port = port; 24 | } 25 | 26 | action forward_port(PortId_t port) { 27 | ig_intr_md_for_tm.ucast_egress_port = port; 28 | } 29 | 30 | action forward_queue(QueueId_t qid) { 31 | ig_intr_md_for_tm.qid = qid; 32 | 33 | } 34 | 35 | action bypass_egress() { 36 | ig_intr_md_for_tm.bypass_egress = 1w1; 37 | } 38 | 39 | action resubmit_tx() { /* resubmit */ 40 | ig_intr_md_for_dprsr.resubmit_type = RESUB_DPRSR_DIGEST_REPLY; 41 | } 42 | 43 | action recirculate_rx() { // only for each out_port of ToR->Server 44 | #if (LPBK_FOR_CTRL == 1) 45 | meta.out_port = (bit<9>)16; // loopback port XXX 46 | #else 47 | meta.out_port[8:7] = meta.pipeline_index; // pipeline index 48 | meta.out_port[6:0] = (bit<7>)RECIRC_PORT; /* RECIRC PORT (for each pipe) */ 49 | #endif 50 | hdr.ipv4.ecn = 0x0; /* disable ECN during recirculation */ 51 | } 52 | 53 | action swap_src_dst_fields() { /* swap srcip <-> dstip */ 54 | meta.dummy_32b = hdr.ipv4.dst_addr; 55 | hdr.ipv4.dst_addr = hdr.ipv4.src_addr; 56 | hdr.ipv4.src_addr = meta.dummy_32b; 57 | } 58 | 59 | /**************************************************************************** 60 | * C O N W E A V E - T X T O R 61 | ****************************************************************************/ 62 | 63 | /* REPLY DEADLINE */ 64 | action do_get_new_reply_timeout() { 65 | meta.ts_new_reply_timeout = meta.ts_now + meta.ts_base_rtt; 66 | } 67 | action do_get_max_reply_timeout() { 68 | meta.ts_new_reply_timeout = CONWEAVE_MAX_TIMESTAMP; 69 | } 70 | 71 | /* HEADER UPDATE */ 72 | action do_update_conweave_header_epoch() { 73 | hdr.bth.conweave_epoch = meta.result_epoch; /* conweave epoch */ 74 | } 75 | action do_update_conweave_header_phase() { 76 | hdr.bth.conweave_phase = meta.result_phase; /* conweave 
phase */ 77 | } 78 | action do_update_conweave_header_opcode(bit<2> opcode) { 79 | hdr.bth.conweave_opcode = opcode; /* conweave data tag */ 80 | } 81 | action update_conweave_header_tail_flag() { /* reply_timeout -> send TAIL packet */ 82 | hdr.bth.conweave_tail_flag = 1; 83 | } 84 | 85 | 86 | 87 | /**************************************************************************** 88 | * C O N W E A V E - R X T O R 89 | ****************************************************************************/ 90 | /* hashkey 32 bits: CRC32 over (src_addr, dst_addr, udp src_port) */ 91 | Hash<bit<32>>(HashAlgorithm_t.CRC32) hash_crc32; 92 | action get_hash_flowkey_step1() { /* creates flow hashkey */ 93 | meta.hash_flowkey = (bit<32>)hash_crc32.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port}); 94 | } 95 | action get_hash_flowkey_step2() { /* crafts non-zero flow hashkey (saturating +1) */ 96 | meta.hash_flowkey = meta.hash_flowkey |+| 1; 97 | } 98 | 99 | /* Sample QueueID: three hashes (CRC8, CRC16, IDENTITY) of the same flow tuple give three candidate queue ids. NOTE(review): the Hash<...> template argument of the three externs below looks lost in extraction - confirm against the original file */ 100 | Hash(HashAlgorithm_t.CRC8) hash_crc8; 101 | Hash(HashAlgorithm_t.CRC16) hash_crc16; 102 | Hash(HashAlgorithm_t.IDENTITY) hash_identity; 103 | action sample_hash_qid_step_one() { 104 | meta.hash_qid_sample_c1 = (QueueId_t)(hash_crc8.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port})); 105 | meta.hash_qid_sample_c2 = (QueueId_t)(hash_crc16.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port})); 106 | meta.hash_qid_sample_c3 = (QueueId_t)(hash_identity.get({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, hdr.udp.src_port})); 107 | } 108 | action sample_hash_qid_step_two() { 109 | meta.hash_qid_sample_c1 = meta.hash_qid_sample_c1 + CONWEAVE_QREG_IDX_OFFSET_C1; 110 | meta.hash_qid_sample_c2 = meta.hash_qid_sample_c2 + CONWEAVE_QREG_IDX_OFFSET_C2; 111 | meta.hash_qid_sample_c3 = meta.hash_qid_sample_c3 + CONWEAVE_QREG_IDX_OFFSET_C3; 112 | } 113 | 114 | 115 | 116 | 117 | /* ADJUST TIMESTAMP WRAP-AROUND */ 118 | action do_calc_tx_timegap_ts_rx() { 119 | meta.ts_timegap_rx = meta.ts_tail |-| meta.ts_phase0_tx; // no wrap-around 120 | } 121 | action 
do_default_tx_timegap_ts_rx() { 122 | meta.ts_timegap_rx = CONWEAVE_RX_DEFAULT_WAITING_TIME; // default flush waiting time 123 | meta.ts_phase0_rx = meta.ts_now; /** NOTE: overwrite as no phase0 info */ 124 | } 125 | /* CALC EXPECTED TAIL ARRIVAL TIME */ 126 | action do_calc_expected_tail_arrival_phase0_ts_rx() { 127 | meta.ts_expected_tail_arrival_rx = meta.ts_now + meta.ts_timegap_rx; 128 | } 129 | action do_calc_expected_tail_arrival_phase1_ts_rx() { 130 | meta.ts_expected_tail_arrival_rx = meta.ts_phase0_rx + meta.ts_timegap_rx; 131 | } 132 | 133 | 134 | /* HEADER CLEANING */ 135 | action invalid_conweave_ig() { 136 | hdr.cwh.setInvalid(); /* remove conweave header */ 137 | } 138 | action initialize_bth_header_ig() { 139 | hdr.bth.conweave_opcode = 0; 140 | hdr.bth.conweave_phase = 0; 141 | hdr.bth.conweave_epoch = 0; 142 | hdr.bth.conweave_ask_reply = 0; 143 | hdr.bth.conweave_tail_flag = 0; 144 | hdr.bth.out_port = 0; 145 | } 146 | -------------------------------------------------------------------------------- /leaf_conweave_resource/p4src/includes/conweave_egress.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include <core.p4> 3 | #if __TARGET_TOFINO__ == 2 4 | #include <t2na.p4> 5 | #else 6 | #include <tna.p4> 7 | #endif /* architecture header: t2na.p4 when targeting Tofino2, tna.p4 otherwise */ 8 | 9 | #include "headers.p4" 10 | #include "macro.p4" 11 | #include "parser.p4" 12 | 13 | /************************************************************************* 14 | **************** E G R E S S P R O C E S S I N G ******************* 15 | *************************************************************************/ 16 | 17 | control SwitchEgress( 18 | inout header_t hdr, 19 | inout metadata_t meta, 20 | in egress_intrinsic_metadata_t eg_intr_md, 21 | in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr, 22 | inout egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr, 23 | inout egress_intrinsic_metadata_for_output_port_t eg_intr_md_for_oport) { 24 | /* include actions, registers, 
and tables */ 25 | 26 | #include "actions_egress.p4" 27 | #include "registers_egress.p4" 28 | #include "tables_egress.p4" 29 | 30 | apply { 31 | /*------------------------------------------------------------------------------------------ 32 | Tx REPLY, NOTIFY -> {swap src/dst, update phase, and update opcode} based on metadata 33 | 34 | TODO: REPLY will have no delay by queue. 35 | But, NOTIFY can be delayed because we craft using the "original" packet (mirror_option > 0). 36 | Later, we should craft using the mirrored packet (how?) 37 | -------------------------------------------------------------------------------------------*/ 38 | if (meta.ig_mirror1.mirror_option == 1) { /** Reply of TAIL (NOTE:using original packet) */ 39 | swap_src_dst_fields(); 40 | if (hdr.cwctrl.isValid()) { 41 | /* this step is necessary, because we sometimes mirror CTRL pkt (see CWCTRL part at ingress pipeline) */ 42 | hdr.cwctrl.setInvalid(); 43 | hdr.ethernet.ether_type = ether_type_t.IPV4; 44 | } 45 | hdr.bth.conweave_opcode = 2; 46 | hdr.bth.conweave_phase = 1; 47 | hdr.bth.conweave_ask_reply = 0; 48 | hdr.bth.conweave_tail_flag = 0; 49 | hdr.bth.flags = 0; 50 | hdr.ipv4.ecn = 0b00; 51 | #ifdef FULL_CODE 52 | do_debug_eg_cntr1(); // XXX 53 | #endif 54 | 55 | // use same epoch 56 | // exit; 57 | } else if (meta.ig_mirror1.mirror_option == 2) { /** Reply of INIT (NOTE:using original packet) */ 58 | swap_src_dst_fields(); 59 | hdr.bth.conweave_opcode = 2; 60 | hdr.bth.conweave_phase = 0; 61 | hdr.bth.conweave_ask_reply = 0; 62 | hdr.bth.conweave_tail_flag = 0; 63 | hdr.bth.flags = (bit<8>)hdr.ipv4.ecn; /* INIT Reply with NOTIFY */ 64 | hdr.ipv4.ecn = 0b00; 65 | #ifdef FULL_CODE 66 | do_debug_eg_cntr2(); // XXX 67 | #endif 68 | 69 | // use same epoch 70 | // exit; 71 | } 72 | #if (LPBK_FOR_NOTIFY == 1) 73 | else if (eg_intr_md.egress_port == 8) { /** NOTIFY (NOTE: using crafted packet) */ 74 | #else 75 | else if (meta.ig_mirror1.mirror_option == 3) { /* NOTIFY */ 76 | #endif 77 | 
swap_src_dst_fields(); 78 | hdr.bth.conweave_opcode = 3; 79 | hdr.bth.conweave_ask_reply = 0; 80 | hdr.bth.conweave_tail_flag = 0; 81 | hdr.bth.flags = 0; 82 | hdr.ipv4.ecn = 0b00; 83 | #ifdef FULL_CODE 84 | do_debug_eg_cntr3(); // XXX 85 | #endif 86 | 87 | // use same epoch 88 | // use same out_port in bth 89 | // exit; 90 | } else { /* rest of packets toward recirc/lpbk -> CWCTRL */ 91 | /** NOTE: RUN ONLY ONCE PER CWCTRL PKT, Newly mirrored packet for Ctrl */ 92 | #if (LPBK_FOR_CTRL == 1) 93 | if (eg_intr_md.egress_port == 16 && hdr.cwctrl.isValid() == false) { /** CTRL (NOTE: using crafted packet) */ 94 | #else 95 | if (eg_intr_md.egress_port [6:0] == (bit<7>)RECIRC_PORT && hdr.cwctrl.isValid() == false) { 96 | #endif 97 | /* validate hdr.cwctrl header!! */ 98 | hdr.cwctrl.setValid(); 99 | hdr.cwctrl.pre_timeout = 0; 100 | hdr.cwctrl.timeout = 0; 101 | hdr.cwctrl.drop = 0; 102 | hdr.cwctrl.cntr_eg = 0; 103 | hdr.cwctrl.afc_msg = 0; 104 | hdr.ethernet.ether_type = (bit<16>)ether_type_t.CWCTRL; 105 | 106 | /* update/initialize header */ 107 | hdr.bth.conweave_ask_reply = 0; 108 | hdr.bth.conweave_tail_flag = 0; 109 | hdr.bth.flags = 0; 110 | hdr.ipv4.ecn = 0b00; 111 | #ifdef FULL_CODE 112 | do_debug_eg_cntr4(); // XXX 113 | #endif 114 | // use same epoch 115 | // use same phase (phase-1) 116 | // use same opcode (1) 117 | // exit; 118 | } else { 119 | /* mirror_option = 4 -> ORIGINAL NewOoO PACKET !! 
*/ 120 | 121 | #if (LPBK_FOR_CTRL == 1) 122 | if (eg_intr_md.egress_port == 16 && hdr.cwctrl.isValid()) { /* CTRL */ 123 | #else 124 | if (eg_intr_md.egress_port [6:0] == (bit<7>)RECIRC_PORT && hdr.cwctrl.isValid()) { /* CTRL */ 125 | #endif 126 | /*----------------------------------------------------------- 127 | Egress Dequeue Depth History 128 | ------------------------------------------------------------*/ 129 | /* hdr.cwctrl.afc_msg (32bits) -> meta.idx_qdepth_history_rx */ 130 | do_get_idx_queue_occupancy_array_ctrl_eg.apply(); 131 | if (meta.hit_idx_queue_occupancy_tbl_eg == 1) { /* if hit */ 132 | /** DROP: reorder is resolved, reset counter to 0 */ 133 | /** READ: read register and save to cwctrl header */ 134 | do_read_reset_buffer_egress_cntr(); /* READ -> hdr.cwctrl.cntr_eg */ 135 | } 136 | 137 | if (hdr.cwctrl.pre_timeout == 1) { 138 | // hdr.cwctrl.hashidx 139 | do_check_tail_resume(); // -> meta.flag_check_tail_resume = 1 if TAIL already resumed the reorder queue 140 | if (meta.flag_check_tail_resume == 1) { 141 | hdr.cwctrl.pre_timeout = 0; 142 | hdr.cwctrl.timeout = 1; 143 | } 144 | } 145 | 146 | } else { 147 | /*-------------------------------------------------------------------------------------*/ 148 | /*------ Only DATA packets (at both srcToR/dstToR) will be processed by following -----*/ 149 | /*-------------------------------------------------------------------------------------*/ 150 | 151 | /*------------------------------------------ 152 | Resume Reorder Queue - by TAIL 153 | -------------------------------------------*/ 154 | if (hdr.tailh.isValid()) { /* resume the reorder queue at egress deparser*/ 155 | eg_intr_md_for_dprsr.adv_flow_ctl = hdr.tailh.afc_msg_resume; 156 | 157 | /* return back to original packet header */ 158 | hdr.tailh.setInvalid(); 159 | hdr.ethernet.ether_type = ether_type_t.IPV4; 160 | 161 | /* record TAIL has resumed the reorder queue. CTRL will check it. 
*/ 162 | do_update_tail_resume(); 163 | #ifdef FULL_CODE 164 | do_debug_eg_cntr5(); // XXX 165 | #endif 166 | } 167 | 168 | /*----------------------------------------------------------- 169 | Egress Dequeue Depth History 170 | ------------------------------------------------------------*/ 171 | /* eg_intr_md.egress_port (9 bits), eg_intr_md.egress_qid (7 bits) -> meta.idx_qdepth_history_rx **/ 172 | do_get_idx_queue_occupancy_array_data_eg.apply(); 173 | if (meta.hit_idx_queue_occupancy_tbl_eg == 1) { /* if hit */ 174 | /** DEQUEUE: increase counter by 1 */ 175 | do_increment_buffer_egress_cntr(); 176 | } 177 | 178 | #ifdef FULL_CODE 179 | /*----------------------------------------------------------- 180 | ECN MARKING (DCQCN <- RDMA, DCTCP <- TCP) 181 | ------------------------------------------------------------*/ 182 | if (hdr.ipv4.ecn == 0b01 || hdr.ipv4.ecn == 0b10) { 183 | if (meta.is_roce_v2 == 1) { // RoCEv2 Pkt 184 | /* DCQCN (RED-like marking) */ 185 | dcqcn_get_ecn_probability.apply(); // get probability to ecn-mark 186 | dcqcn_get_random_number(); // get random number for sampling 187 | dcqcn_compare_probability.apply(); // fills meta.mark_ecn_codepoint 188 | } else { // use DCTCP-like marking 189 | check_ecn_marking_threshold(); // fills meta.mark_ecn_codepoint 190 | } 191 | 192 | if (meta.mark_ecn_codepoint == 1) { 193 | mark_ecn_ce_codepoint(); 194 | } 195 | } // #### ECN Marking (end) ###### 196 | #endif 197 | /*--------------------------------------------------------------------- 198 | CLEAR BTH & CWH HEADERS OF ORIGINAL (NON-MIRRORING) PKTS TO DST 199 | ----------------------------------------------------------------------*/ 200 | do_check_toward_dst.apply(); /* -> meta.last_hop */ 201 | if (meta.last_hop == 1) { 202 | if (hdr.cwh.isValid()) { 203 | invalid_conweave_eg(); 204 | } 205 | if (hdr.bth.isValid()) { 206 | initialize_bth_header_eg(); 207 | } 208 | 209 | } 210 | } 211 | } 212 | } 213 | } 214 | } // End of SwitchEgress 
--------------------------------------------------------------------------------
/leaf_conweave_resource/p4src/includes/headers.p4:
--------------------------------------------------------------------------------
#ifndef _HEADERS_
#define _HEADERS_

#include "macro.p4"

/*******************************************************
 ****          C L A S S I C   H E A D E R          ****
 ********************************************************/

/* Ethernet II header (14 bytes). */
header ethernet_h {
    mac_addr_t dst_addr;
    mac_addr_t src_addr;
    bit<16> ether_type;
}

/* ARP header for IPv4-over-Ethernet. */
header arp_h {
    bit<16> htype;
    bit<16> ptype;
    bit<8> hlen;
    bit<8> plen;
    bit<16> oper;
    mac_addr_t sender_hw_addr;
    ipv4_addr_t sender_ip_addr;
    mac_addr_t target_hw_addr;
    ipv4_addr_t target_ip_addr;
}

/* IPv4 header without options (20 bytes). The ToS byte is split into dscp/ecn. */
header ipv4_h {
    bit<4> version;
    bit<4> ihl;
    bit<6> dscp;        // tos field
    bit<2> ecn;         // tos field
    bit<16> total_len;  // 1024B MTU RDMA -> 1084 (CX6), 1068 (CX5 except WR_FIRST)
    bit<16> identification;
    bit<3> flags;
    bit<13> frag_offset;
    bit<8> ttl;
    bit<8> protocol;
    bit<16> hdr_checksum;
    ipv4_addr_t src_addr;
    ipv4_addr_t dst_addr;
}

/* TCP header without options (20 bytes). */
header tcp_h {
    bit<16> src_port;
    bit<16> dst_port;
    bit<32> seq_no;
    bit<32> ack_no;
    bit<4> data_offset;
    bit<4> res;
    bit<8> flags;
    bit<16> window;
    bit<16> checksum;
    bit<16> urgent_ptr;
}

/* UDP header (8 bytes). */
header udp_h {
    bit<16> src_port;
    bit<16> dst_port;
    bit<16> hdr_length;
    bit<16> checksum;
}

/* ICMP header followed by a 64-bit data field. */
header icmp_h {
    bit<8> type_;
    bit<8> code;
    bit<16> hdr_checksum;
    bit<16> id;
    bit<16> seq_no;
    bit<64> data_time;
}

/*---- RDMA (12 bytes) ----*/
/* Base Transport Header; the two reserved areas carry ConWeave state. */
header ib_bth_h {
    bit<8> opcode;
    bit<8> flags; /** NOTE: "flags" field is used for REPLY INIT's ECN (0x3) between SrcToR/DstToR. No effect on RDMA. */
    bit<16> partition_key;

    /*--- RC reserved0 (8 bits) ----*/
    bit<8> out_port;
    /*---------------------*/

    bit<24> destination_qp;
    bit<1> ack_request;

    /*--- RC reserved1 (7 bits)----*/
    bit<2> conweave_opcode; /* 0: NOTHING, 1: DATA, 2: REPLY, 3: NOTIFY */
    bit<1> conweave_phase;
    bit<2> conweave_epoch;
    bit<1> conweave_ask_reply;
    bit<1> conweave_tail_flag; /* TAIL */
    /*---------------------*/

    bit<24> packet_seqnum;
}

// ACK (AETH)
// The first three fields together form the 8-bit syndrome; the MSN is 24 bits,
// so the whole header is 4 bytes on the wire. It was previously declared with
// an 8-bit MSN (a 2-byte header), which would misalign anything extracted
// after it. This header is not instantiated in header_t.
header ib_aeth_h {
    bit<1> reserved;
    bit<2> opcode;           // (0: ACK, 3: NACK)
    bit<5> error_code;       // (PSN SEQ ERROR)
    bit<24> msg_seq_number;  // 24-bit MSN
}

/*******************************************************
 ****  A D V A N C E D   F L O W   C O N T R O L   ****
 *******************************************************/
/* ConWeave control-loop header (11 bytes). */
header conweave_ctrl_h {
    @padding bit<5> _pad1;
    bit<1> pre_timeout; /* 1: pre_timeout (must check egress register) */
    bit<1> timeout;     /* 1: timeout triggered */
    bit<1> drop;        /* 1: must be dropped */
    bit<32> cntr_eg;    /* reorder-buffer egress counter */
    bit<32> afc_msg;    /* without credit setup (i.e., the least significant 15 bits are empty) */
    bit<16> hashidx;    // hashidx for egress pipeline


    /** AFC: Format */
    // bit<1> qfc;
    // bit<2> tm_pipe_id;
    // bit<4> tm_mac_id;
    // bit<3> _pad;
    // bit<7> tm_mac_qid;
    // bit<15> credit;
}

/* ConWeave TAIL header (6 bytes). */
header conweave_tail_h {
    bit<32> afc_msg_resume;
    bit<16> hashidx;  // hashidx for egress pipeline
}

/*******************************************************
 ****       C O N W E A V E   H E A D E R          ****
 *******************************************************/
header conweave_h {
    bit<16> ts_tx;
    bit<16> ts_tail;
}

/* Empty resubmit header (carries no fields). */
header resubmit_h {
}

/* Empty egress-mirror header (carries no fields). */
header eg_mirror1_h {
}

/* Ingress-mirror header: a single byte telling egress why the packet was mirrored. */
header ig_mirror1_h {
    bit<8> mirror_option; /* 1: TAIL's REPLY (CLEAR), 2: INIT's REPLY, 3: NOTIFY, 4: Reorder-Ctrl */
}

/* All headers the parsers may extract and the deparsers emit. */
struct header_t {
    ethernet_h ethernet;
    ipv4_h ipv4;
    arp_h arp;
    tcp_h tcp;
    udp_h udp;
    icmp_h icmp;
    ib_bth_h bth;           /* RDMA headers */
    conweave_h cwh;         /* ConWeave header */
    conweave_ctrl_h cwctrl; /* ConWeave Ctrl header */
    conweave_tail_h tailh;  /* ConWeave TAIL header (if needed) */
}

/*******************************************************
 ****      H E A D E R   &   M E T A D A T A       ****
 ********************************************************/

/* User metadata shared by the ingress and egress parsers/controls. */
struct metadata_t {
    /* resubmit or mirroring */
    resubmit_h resubmit_hdr;
    eg_mirror1_h eg_mirror1;
    ig_mirror1_h ig_mirror1;
    MirrorId_t mirror_session;

    /* ConWeave */
    bit<1> conweave_on_off; /* switch on/off */
    bit<2> conweave_logic;  /* 1: TxToR, 2: RxToR, 3: WRONG, 0: intra-ToR */
    bit<2> pipeline_index;  /* ig_intr_md.ingress_port[8:7], see parser */

    /* switch's ID for our virtual topology */
    switch_id_t switch_id;
    nexthop_id_t nexthop_id;
    bit<1> last_hop;
    PortId_t out_port;       // final
    QueueId_t out_queue_id;  // final


    /* dummy and common metadata */
    ipv4_addr_t dummy_32b; /* for sip<->dip swap (REPLY & NOTIFY) */
    ipv4_addr_t meta_src_addr;
    ipv4_addr_t meta_dst_addr;
    timestamp_t ts_now;
    timestamp_t ts_tail;
    hashidx_t hashidx; /* key & table idx */
    bit<1> digest_on;  /* digest flowkey */


    /* packet metadata */
    bit<2> pkt_epoch;     /* <- hdr.bth.conweave_epoch */
    bit<1> pkt_phase;     /* <- hdr.bth.conweave_phase */
    bit<1> pkt_ask_reply; /* <- hdr.bth.conweave_ask_reply */
    bit<1> pkt_tail_flag; /* <- hdr.bth.conweave_tail_flag */

    bit<1> flag_cwctrl_active;  /* hdr.cwctrl.isValid() */
    bit<1> pkt_cwctrl_timeout;  /* <- hdr.cwctrl.timeout */
    bit<1> pkt_cwctrl_drop;     /* <- hdr.cwctrl.drop */
    bit<32> pkt_cwctrl_cntr_eg; /* <- hdr.cwctrl.cntr_eg */
    bit<32> pkt_cwctrl_afc_msg; /* <- hdr.cwctrl.afc_msg */

    /* pair for initialization */
    pair init_cntr_ig;
    /***********************************************************
     *      C O N W E A V E  -  T X   M E T A D A T A
     ***********************************************************/
    /* timestamp */
    timestamp_t ts_base_rtt;
    timestamp_t ts_new_reply_timeout;

    /* sampled port info */
    bit<8> sample_port_c1;        // chance 1
    bit<8> sample_port_c2;        // chance 2
    bit<8> good_port;             // good port without ECN marking
    bit<8> final_port;            // final port to send a current packet
    bit<1> no_good_port;          // if good_port is not actually good enough
    bit<2> stage_to_record_port;  // CRC8 or out_port[1:0]

    /* metadata at TX */
    bit<1> flag_rdma_data;
    bit<1> flag_matched;              // 1: found from get_hash_idx table
    bit<1> flag_enforce_no_reroute;   // 1: enforce not to reroute, since TS_MAX - 10ms
    bit<1> result_expired;            // 1: expired
    bit<1> result_stability;          // 1: stable
    bit<1> result_reply_timeout;      // 1: timeout
    bit<1> result_timely_replied;     // 1: timely replied
    bit<1> result_phase;              // phase 1 is possible only when we call "do_get_phase()"
    bit<2> result_epoch;              // current epoch
    bit<1> result_port_c1_bad;        // 1: sample_c1 is bad port
    bit<1> result_port_c2_bad;        // 1: sample_c2 is bad port
    bit<1> result_reply_with_notify;  // 1: INIT's reply with NOTIFY

    /***********************************************************
     *      C O N W E A V E  -  R X   M E T A D A T A
     ***********************************************************/
    bit<32> hash_flowkey;

    timestamp_t ts_phase0_tx;
    timestamp_t ts_phase0_rx;
    timestamp_t ts_timegap_rx;               /* tail_tx - phase0_tx */
    timestamp_t ts_expected_tail_arrival_rx; /* time to flush queue */

    bit<2> result_epoch_rx;        /* 1: new epoch, 2: prev epoch so bypass, 0: process */
    bit<1> result_phase0_cch_rx;   /* 1: phase-0 pkt has passed (or is passing) */
    bit<1> result_tail_cch_rx;     /* 1: tail has passed (or is passing) */
    bit<1> result_out_of_order_rx; /* 1: out-of-ordered packet */
    bit<2> result_reorder_status;  /* 1: reorder is on-going, 2: new register */

    QueueId_t hash_qid_sample_c1;  // 25G: 4 queues (2 bits), 100G: 8 queues (3 bits)
    QueueId_t hash_qid_sample_c2;  // 25G: 4 queues (2 bits), 100G: 8 queues (3 bits)
    QueueId_t hash_qid_sample_c3;  // 25G: 4 queues (2 bits), 100G: 8 queues (3 bits)

    // Width is CONWEAVE_QREG_IDX_WIDTH bits (macro.p4). The earlier note here,
    // "12 bits - port(9) + queue(3)", does not match that macro in this build.
    conweave_qreg_idx_width_t idx_q_occup_arr_rx_c1;
    conweave_qreg_idx_width_t idx_q_occup_arr_rx_c2;
    conweave_qreg_idx_width_t idx_q_occup_arr_rx_c3;

    bit<1> result_q_occupancy_c1; /* 1: registered, or matched */
    bit<1> result_q_occupancy_c2; /* 1: registered, or matched */
    bit<1> result_q_occupancy_c3; /* 1: registered, or matched */

    bit<1> result_time_flush_queue_rx;   /* 1: timeout */
    bit<1> possibly_tail_before_timeout; /* 1: possibly TAIL before timeout */
    bit<1> flag_mirr_for_ctrl_loop;      /* 1: mirror */
    bit<1> result_tail_send_reply_rx;    /* 1: send TAIL's reply */
    bit<1> flag_finish_reorder_process;  /* 1: reorder is resolved */
    bit<1> flag_resume_reorder_queue;    /* 1: resume reorder queue */
    bit<1> flag_check_tail_resume;       /* 1: queue is resumed in advance by TAIL */
    bit<32> result_q_pkt_cntr_ig;        /* counter */

    /* Egress qdepth metadata */
    // Width is CONWEAVE_QDEPTH_IDX_WIDTH bits (macro.p4); the earlier note said "13 bits".
    conweave_qdepth_idx_width_t idx_qdepth_history_rx;


    /*********** T E M P O R A R I L Y ********/
    bit<1> cntr_additive;



    /***********************************************************
     *      A D V A N C E D   F L O W   C O N T R O L
     ***********************************************************/
    afc_msg_t afc_msg_c1;  // 32 bits, without PAUSE/RESUME instruction yet
    afc_msg_t afc_msg_c2;  // 32 bits, without PAUSE/RESUME instruction yet
    afc_msg_t afc_msg_c3;  // 32 bits, without PAUSE/RESUME instruction yet


    /***********************************************************
     *      D C Q C N  -  E C N   M A R K I N G
     ***********************************************************/
    bit<1> mark_ecn_codepoint;
    bit<1> is_roce_v2;
    bit<8> dcqcn_prob_output;
    bit<8> dcqcn_random_number;


    /***********************************************************
     *      S O M E T H I N G   D E B U G
     ***********************************************************/
    bit<1> flag_something_wrong;

    /***********************************************************
     *      S O M E T H I N G   E G R E S S
     ***********************************************************/
    bit<1> hit_idx_queue_occupancy_tbl_eg;  // set to 1 by the egress queue-index lookup actions on a hit
}

#endif
--------------------------------------------------------------------------------
/leaf_conweave_resource/p4src/includes/macro.p4:
--------------------------------------------------------------------------------
#ifndef _MACROS_
#define _MACROS_

#define LPBK_FOR_CTRL (1)
#define LPBK_FOR_NOTIFY (1)

/*************************************************************************/
/****** IMPORTANT: Different configuration for 25G/100G link speed *******/
#define CONWEAVE_EVAL_Q16_OR_Q32 (0)  // 0: 16 per port, 1: 32 per port
                                      // check all {config_leaf.py, leaf_conweave.cpp, macro.p4}
/*************************************************************************/

/*************************************************************************/

/*************************************************************************
 *************  C O N S T A N T S    A N D   T Y P E S  *****************
 *************************************************************************/

/* for ConWeave Table */
#define CONWEAVE_HASH_WIDTH (16)  // maximum 16 bits
#define CONWEAVE_TABLE_SIZE (1 << CONWEAVE_HASH_WIDTH)
typedef bit<CONWEAVE_HASH_WIDTH> hashidx_t;

/* for ConWeave Reordering Queue */
#define CONWEAVE_QREG_IDX_WIDTH (10)  // 10 bits here, at ingress (comment previously said 13 bits)
#define CONWEAVE_QREG_IDX_SIZE (1 << CONWEAVE_QREG_IDX_WIDTH)
typedef bit<CONWEAVE_QREG_IDX_WIDTH> conweave_qreg_idx_width_t;

#if (CONWEAVE_EVAL_Q16_OR_Q32 == 0)  // 0: Q16 (25Gbps), 1: Q32 (100Gbps)
#define CONWEAVE_QREG_IDX_OFFSET_C1 (2)
#define CONWEAVE_QREG_IDX_OFFSET_C2 (6)
#define CONWEAVE_QREG_IDX_OFFSET_C3 (10)
#else
#define CONWEAVE_QREG_IDX_OFFSET_C1 (2)
#define CONWEAVE_QREG_IDX_OFFSET_C2 (10)
#define CONWEAVE_QREG_IDX_OFFSET_C3 (18)
#endif

#define CONWEAVE_QDEPTH_IDX_WIDTH (10)  // 10 bits here, at egress (comment previously said 13 bits)
#define CONWEAVE_QDEPTH_IDX_SIZE (1 << CONWEAVE_QDEPTH_IDX_WIDTH)
typedef bit<CONWEAVE_QDEPTH_IDX_WIDTH> conweave_qdepth_idx_width_t;

/* type definitions */
typedef bit<32> afc_msg_t;
typedef bit<48> mac_addr_t;
typedef bit<32> ipv4_addr_t;

typedef bit<12> nexthop_id_t;
typedef bit<8> switch_id_t;
typedef bit<32> timestamp_t;  // use middle of 31 bits, e.g., (bit<32>)X[40:10]
#if (CONWEAVE_EVAL_Q16_OR_Q32 == 0)
typedef bit<2> conweave_qid_width_t;  // 2 bits - 4 queues with 3 stages - total 12 queues
#else
typedef bit<3> conweave_qid_width_t;  // 3 bits - 8 queues with 3 stages - total 24 queues
#endif


/************************************************************************************************/
/* FOR DEBUGGING (MAKE SLOW) */
#define TIME_RESOLUTION_OFFSET1 (0) /* 17: (debug) 0.13 actual sec per unit, 0: original speed */
#define TIME_RESOLUTION_OFFSET2 (0) /* 7: (debug) max resolution, 0: original speed */
/************************************************************************************************/

/** CONWEAVE: PARAMETERS */
/* NOTE(review): every parameter below is 0 in this source; the inline comment
 * on CONWEAVE_MAX_TIMESTAMP suggests 2**31 - 1 was intended. Confirm before
 * reusing these values outside this resource-evaluation build. */
const timestamp_t CONWEAVE_MAX_TIMESTAMP = 0;  // 2**31 - 1

const timestamp_t CONWEAVE_TX_EXPIRED_TS = 0;
const timestamp_t CONWEAVE_TX_ECN_PORT_TS = 0;
const timestamp_t CONWEAVE_TX_REPLY_TIMEOUT_EXTENSION_TS = 0;
const timestamp_t CONWEAVE_TX_STOP_REROUTING_TS = 0;

const timestamp_t CONWEAVE_RX_DEFAULT_WAITING_TIME = 0;
const timestamp_t CONWEAVE_RX_BASE_WAITING_TIME = 0;
const timestamp_t CONWEAVE_RX_ADJUST_TS_TAIL_WRAP = 0;
const timestamp_t CONWEAVE_RX_ADJUST_TS_TAIL_WRAP_WITH_BASE = 0;

/** CONWEAVE: ADVANCED FLOW CONTROL */
#define AFC_CREDIT_PAUSE (1)
#define AFC_CREDIT_RESUME (0)

/* for resubmission */
const bit<3> RESUB_DPRSR_DIGEST_REPLY = 7;

/* for mirroring */
const bit<8> MIRROR_SESSION_CONWEAVE = 220;  // + pipe_id (0,1,2,3)

/* for custom hashing (crc32_mpeg) */
CRCPolynomial<bit<32>>(32w0x04C11DB7, false, false, false, 32w0xFFFFFFFF, 32w0x00000000) CRC32_MPEG;

/*************************************************************************
 *************  C O N S T A N T S    A N D   T Y P E S  *****************
 *************************************************************************/

/* ARP */
#define MCAST_GRP_ID (1)

/* Mirror Types & Recirculation */
#if __TARGET_TOFINO__ == 2
#define RECIRC_PORT (6)             // recirc port on Tofino2
const bit<4> EG_MIRROR_TYPE_1 = 1;  // corresponds to eg_mirror1_h
const bit<4> IG_MIRROR_TYPE_1 = 2;  // corresponds to ig_mirror1_h
#else
#define RECIRC_PORT (68)            // recirc port on Tofino1
const bit<3> EG_MIRROR_TYPE_1 = 1;  // corresponds to eg_mirror1_h
const bit<3> IG_MIRROR_TYPE_1 = 2;  // corresponds to ig_mirror1_h
#endif

/* Hashing and Registers */
struct pair {  // for 32-bit pair
    bit<32> lo;
    bit<32> hi;
}

/* for ECMP LAG */
#define MAX_GROUP_SIZE (32)
#define MAX_GROUPS (256)
#define MAX_PROFILE_MEMBERS (2048)
#define TABLE_IPV4_SIZE (2048)
#define TABLE_NEXTHOP_SIZE (2048)
#define SCRAMBLE_ENABLE (1)
#define HASH_WIDTH (16)

#endif
--------------------------------------------------------------------------------
/leaf_conweave_resource/p4src/includes/parser.p4:
--------------------------------------------------------------------------------
#ifndef _PARSER_
#define _PARSER_

#include "macro.p4"

enum bit<16> ether_type_t {
    IPV4 = 0x0800,
    ARP = 0x0806,
    CWCTRL = 0x2001,  // conweave's ctrl-loop header
    CWTAIL = 0x2002   // conweave's TAIL header
}

enum bit<8> ipv4_proto_t {
    TCP = 6,
    UDP = 17,
    ICMP = 1
}

enum bit<16> udp_proto_t{
    ROCE_V2 = 4791,
    FAKE_ROCE_V2 = 4792  // XXX
}

// ---------------------------------------------------------------------------
// Ingress parser
//
// Zero-initializes the user metadata, skips the port/resubmit metadata, then
// parses Ethernet -> {ARP | IPv4 -> {TCP | UDP -> BTH -> CWH -> CWCTRL | ICMP}}.
// ---------------------------------------------------------------------------
parser SwitchIngressParser(
    packet_in pkt,
    out header_t hdr,
    out metadata_t meta,
    out ingress_intrinsic_metadata_t ig_intr_md,
    out ingress_intrinsic_metadata_for_tm_t ig_intr_md_for_tm,
    out ingress_intrinsic_metadata_from_parser_t ig_intr_md_from_prsr) {
    state start {
        pkt.extract(ig_intr_md);
        /****************************************************************************
         *      M E T A D A T A   I N I T I A L I Z A T I O N
         ****************************************************************************/

        meta.pipeline_index = ig_intr_md.ingress_port [8:7];  // index of pipeline
        meta.mirror_session = 0;
        meta.conweave_on_off = 0;
        meta.conweave_logic = 0;
        meta.switch_id = 0;
        meta.nexthop_id = 0;
        meta.out_port = 0;
        meta.out_queue_id = 0;
        meta.last_hop = 0;

        meta.dummy_32b = 0;
        meta.ts_now = 0;
        meta.ts_tail = 0;
        meta.hashidx = 0;
        meta.digest_on = 0;

        /*----- C O N W E A V E - TxToR M E T A D A T A -----*/
        meta.ts_base_rtt = 0;
        meta.ts_new_reply_timeout = 0;

        meta.sample_port_c1 = 0;
        meta.sample_port_c2 = 0;
        meta.good_port = 0;
        meta.final_port = 0;
        meta.no_good_port = 0;
        meta.stage_to_record_port = 0;

        meta.flag_rdma_data = 0;
        meta.flag_matched = 0;
        meta.flag_enforce_no_reroute = 0;
        meta.result_expired = 0;
        meta.result_stability = 0;
        meta.result_reply_timeout = 0;
        meta.result_timely_replied = 0;
        meta.result_phase = 0;
        meta.result_epoch = 0;

        meta.result_port_c1_bad = 0;
        meta.result_port_c2_bad = 0;
        meta.result_reply_with_notify = 0;

        /*----- C O N W E A V E - RxToR (DstToR) M E T A D A T A -----*/
        meta.hash_flowkey = 0;

        meta.ts_phase0_tx = 0;
        meta.ts_phase0_rx = 0;
        meta.ts_timegap_rx = 0;
        meta.ts_expected_tail_arrival_rx = 0;

        meta.result_epoch_rx = 0;
        meta.result_phase0_cch_rx = 0;
        meta.result_tail_cch_rx = 0;
        meta.result_out_of_order_rx = 0;
        meta.result_reorder_status = 0;

        meta.hash_qid_sample_c1 = 0;
        meta.hash_qid_sample_c2 = 0;
        meta.hash_qid_sample_c3 = 0;
        meta.idx_q_occup_arr_rx_c1 = 0;
        meta.idx_q_occup_arr_rx_c2 = 0;
        meta.idx_q_occup_arr_rx_c3 = 0;
        meta.result_q_occupancy_c1 = 0;
        meta.result_q_occupancy_c2 = 0;
        meta.result_q_occupancy_c3 = 0;
        meta.result_time_flush_queue_rx = 0;
        meta.possibly_tail_before_timeout = 0;
        meta.flag_mirr_for_ctrl_loop = 0;
        meta.result_tail_send_reply_rx = 0;
        meta.result_q_pkt_cntr_ig = 0;
        meta.flag_finish_reorder_process = 0;
        meta.flag_resume_reorder_queue = 0;
        meta.idx_qdepth_history_rx = 0;

        /**** TEMPORARILY *****/
        meta.cntr_additive = 0;


        /*----- A D V A N C E D   F L O W   C O N T R O L -----*/
        meta.afc_msg_c1 = 0;
        meta.afc_msg_c2 = 0;
        meta.afc_msg_c3 = 0;

        /*------ D C Q C N -----*/
        meta.mark_ecn_codepoint = 0;
        meta.is_roce_v2 = 0;
        meta.dcqcn_prob_output = 0;
        meta.dcqcn_random_number = 0;

        /*------ M I R R O R I N G ------*/
        meta.ig_mirror1.mirror_option = 0;

        /*---- R E A D   H E A D E R ----*/
        meta.pkt_epoch = 0;
        meta.pkt_phase = 0;
        meta.pkt_ask_reply = 0;
        meta.pkt_tail_flag = 0;

        meta.flag_cwctrl_active = 0;
        meta.pkt_cwctrl_timeout = 0;
        meta.pkt_cwctrl_cntr_eg = 0;
        meta.pkt_cwctrl_drop = 0;
        meta.pkt_cwctrl_afc_msg = 0;

        meta.init_cntr_ig.lo = 0;
        meta.init_cntr_ig.hi = CONWEAVE_MAX_TIMESTAMP;

        /*----- D E B U G -----*/
        meta.flag_something_wrong = 0;

        transition select(ig_intr_md.resubmit_flag) {
            (0) : init_metadata;
            (1) : parse_resubmit;
        }
    }

    state parse_resubmit {
        pkt.extract(meta.resubmit_hdr);
        pkt.advance(PORT_METADATA_SIZE - sizeInBits(meta.resubmit_hdr));
        transition parse_ethernet;
    }

    state init_metadata {
        pkt.advance(PORT_METADATA_SIZE);  // macro defined in tofino.p4
        transition parse_ethernet;
    }

    state parse_ethernet {
        pkt.extract(hdr.ethernet);
        transition select(hdr.ethernet.ether_type) {
            (bit<16>)ether_type_t.IPV4 : parse_ipv4;
            (bit<16>)ether_type_t.ARP : parse_arp;
            (bit<16>)ether_type_t.CWCTRL : parse_ipv4;  // ctrl-loop packets carry IPv4 next
            default: accept;
        }
    }

    state parse_ipv4 {
        pkt.extract(hdr.ipv4);

        /* copy src/dst ip address */
        meta.meta_src_addr = hdr.ipv4.src_addr;
        meta.meta_dst_addr = hdr.ipv4.dst_addr;

        transition select(hdr.ipv4.protocol) {
            (bit<8>)ipv4_proto_t.TCP : parse_tcp;
            (bit<8>)ipv4_proto_t.UDP : parse_udp;
            (bit<8>)ipv4_proto_t.ICMP : parse_icmp;
            default: accept;
        }
    }

    state parse_arp {
        pkt.extract(hdr.arp);
        transition accept;
    }

    state parse_tcp {
        pkt.extract(hdr.tcp);
        transition accept;
    }

    state parse_udp {
        pkt.extract(hdr.udp);
        transition select(hdr.udp.dst_port) {
            (bit<16>)udp_proto_t.ROCE_V2 : parse_bth;
            (bit<16>)udp_proto_t.FAKE_ROCE_V2 : parse_bth;  // XXX
            default: accept;
        }
    }

    state parse_bth {
        pkt.extract(hdr.bth);
        meta.is_roce_v2 = 1;  // RDMA packet
        /* A non-zero conweave_opcode means a ConWeave header follows the BTH. */
        transition select(hdr.bth.conweave_opcode) {
            (bit<2>)1 : parse_conweave;
            (bit<2>)2 : parse_conweave;
            (bit<2>)3 : parse_conweave;
            default: accept;
        }
    }

    state parse_conweave {
        /* pkt metadata */
        meta.pkt_epoch = hdr.bth.conweave_epoch;         /* get pkt's epoch */
        meta.pkt_phase = hdr.bth.conweave_phase;         /* get pkt's phase */
        meta.pkt_ask_reply = hdr.bth.conweave_ask_reply; /* get pkt's ask_reply */
        meta.pkt_tail_flag = hdr.bth.conweave_tail_flag; /* get tail flag */

        pkt.extract(hdr.cwh);
        transition select(hdr.ethernet.ether_type) {
            (bit<16>)ether_type_t.CWCTRL : parse_cwctrl;
            default: accept;
        }
    }


    state parse_cwctrl {
        pkt.extract(hdr.cwctrl);
        meta.flag_cwctrl_active = 1;
        meta.pkt_cwctrl_timeout = hdr.cwctrl.timeout;
        meta.pkt_cwctrl_drop = hdr.cwctrl.drop;
        meta.pkt_cwctrl_cntr_eg = hdr.cwctrl.cntr_eg;
        meta.pkt_cwctrl_afc_msg = hdr.cwctrl.afc_msg;
        transition accept;
    }



    state parse_icmp {
        pkt.extract(hdr.icmp);
        transition accept;
    }
}

// ---------------------------------------------------------------------------
// Ingress Deparser
//
// Recomputes the IPv4 header checksum, emits the resubmit / mirror metadata
// when the ingress control requested it, then emits all valid headers.
// ---------------------------------------------------------------------------

control SwitchIngressDeparser(
    packet_out pkt,
    inout header_t hdr,
    in metadata_t meta,
    in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) {
    Checksum() ipv4_checksum;
    Mirror() mirror;
    Resubmit() resubmit;

    apply {
        /* CHECKSUM */
        hdr.ipv4.hdr_checksum = ipv4_checksum.update({hdr.ipv4.version,
                                                      hdr.ipv4.ihl,
                                                      hdr.ipv4.dscp,
                                                      hdr.ipv4.ecn,
                                                      hdr.ipv4.total_len,
                                                      hdr.ipv4.identification,
                                                      hdr.ipv4.flags,
                                                      hdr.ipv4.frag_offset,
                                                      hdr.ipv4.ttl,
                                                      hdr.ipv4.protocol,
                                                      hdr.ipv4.src_addr,
                                                      hdr.ipv4.dst_addr});

        /* RESUBMIT */
        if (ig_dprsr_md.resubmit_type == RESUB_DPRSR_DIGEST_REPLY) {
            resubmit.emit(meta.resubmit_hdr);
        }

        /* INGRESS MIRRORING FOR REPLY/NOTIFY */
        if (ig_dprsr_md.mirror_type == IG_MIRROR_TYPE_1) {
            /* explicit header type so the field list is emitted as ig_mirror1_h */
            mirror.emit<ig_mirror1_h>(meta.mirror_session, {meta.ig_mirror1.mirror_option});
        }

        pkt.emit(hdr);
    }
}

// ---------------------------------------------------------------------------
// Egress parser
//
// Initializes the egress-side metadata, peeks at the first byte to detect an
// ingress-mirror header, then parses Ethernet -> IPv4 -> UDP -> BTH -> CWH ->
// {CWCTRL | CWTAIL}.
// ---------------------------------------------------------------------------
parser SwitchEgressParser(
    packet_in pkt,
    out header_t hdr,
    out metadata_t meta,
    out egress_intrinsic_metadata_t eg_intr_md,
    out egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr) {
    state start {
        pkt.extract(eg_intr_md);


        /*---- R E A D   H E A D E R ----*/
        meta.pkt_epoch = 0;
        meta.pkt_phase = 0;
        meta.pkt_ask_reply = 0;
        meta.pkt_tail_flag = 0;

        meta.flag_cwctrl_active = 0;
        meta.pkt_cwctrl_timeout = 0;
        meta.pkt_cwctrl_drop = 0;
        meta.pkt_cwctrl_cntr_eg = 0;
        meta.pkt_cwctrl_afc_msg = 0;


        transition parse_metadata;
    }

    state parse_metadata {
        /* D C Q C N */
        meta.mark_ecn_codepoint = 0;
        meta.is_roce_v2 = 0;
        meta.dcqcn_prob_output = 0;
        meta.dcqcn_random_number = 0;

        /*---- M E T A D A T A ----*/
        meta.flag_check_tail_resume = 0;

        /* These fields are only written on a table hit (the tables' default
         * action is nop) but are read unconditionally afterwards, so they must
         * start from a defined value. Mirrors the ingress-parser convention. */
        meta.hit_idx_queue_occupancy_tbl_eg = 0;
        meta.last_hop = 0;
        meta.idx_qdepth_history_rx = 0;

        /* NOTE(review): the first byte is interpreted as mirror_option for every
         * packet; a non-mirrored packet whose first byte is 1..4 would match too.
         * Confirm that cannot happen in the deployed topology. */
        ig_mirror1_h mirror_md = pkt.lookahead<ig_mirror1_h>();
        transition select(mirror_md.mirror_option) {
            1 : parse_mirror_reply_notify;
            2 : parse_mirror_reply_notify;
            3 : parse_mirror_reply_notify;
            4 : parse_mirror_reply_notify;
            default: parse_ethernet;
        }
    }

    /* mirroring */
    state parse_mirror_reply_notify {
        pkt.extract(meta.ig_mirror1);
        transition parse_ethernet;
    }

    state parse_ethernet {
        pkt.extract(hdr.ethernet);
        transition select(hdr.ethernet.ether_type) {
            (bit<16>)ether_type_t.IPV4 : parse_ipv4;
            (bit<16>)ether_type_t.CWCTRL : parse_ipv4;
            (bit<16>)ether_type_t.CWTAIL : parse_ipv4;
            default: accept;
        }
    }

    state parse_ipv4 {
        pkt.extract(hdr.ipv4);
        transition select(hdr.ipv4.protocol) {
            // (bit<8>) ipv4_proto_t.TCP: parse_tcp;
            (bit<8>)ipv4_proto_t.UDP : parse_udp;
            default: accept;
        }
    }

    state parse_udp {
        pkt.extract(hdr.udp);
        transition select(hdr.udp.dst_port) {
            (bit<16>)udp_proto_t.ROCE_V2 : parse_bth;
            (bit<16>)udp_proto_t.FAKE_ROCE_V2 : parse_bth;  // XXX
            default: accept;
        }
    }

    state parse_bth {
        pkt.extract(hdr.bth);
        meta.is_roce_v2 = 1;  // RDMA packet
        transition select(hdr.bth.conweave_opcode) {
            (bit<2>)1 : parse_conweave;
            (bit<2>)2 : parse_conweave;
            (bit<2>)3 : parse_conweave;
            default: accept;
        }
    }

    state parse_conweave {
        /* pkt metadata */
        meta.pkt_epoch = hdr.bth.conweave_epoch;         /* get pkt's epoch */
        meta.pkt_phase = hdr.bth.conweave_phase;         /* get pkt's phase */
        meta.pkt_ask_reply = hdr.bth.conweave_ask_reply; /* get pkt's ask_reply */
        meta.pkt_tail_flag = hdr.bth.conweave_tail_flag; /* get tail flag */

        pkt.extract(hdr.cwh);
        transition select(hdr.ethernet.ether_type) {
            (bit<16>)ether_type_t.CWCTRL : parse_cwctrl;
            (bit<16>)ether_type_t.CWTAIL : parse_cwtail;
            default: accept;
        }
    }


    state parse_cwctrl {
        pkt.extract(hdr.cwctrl);
        meta.flag_cwctrl_active = 1;
        meta.pkt_cwctrl_timeout = hdr.cwctrl.timeout;
        meta.pkt_cwctrl_drop = hdr.cwctrl.drop;
        meta.pkt_cwctrl_cntr_eg = hdr.cwctrl.cntr_eg;
        meta.pkt_cwctrl_afc_msg = hdr.cwctrl.afc_msg;
        transition accept;
    }

    state parse_cwtail {
        pkt.extract(hdr.tailh);
        transition accept;
    }


    // do more stuff here if needed
}

// ---------------------------------------------------------------------------
// Egress Deparser
//
// Emits all valid headers unchanged.
// NOTE(review): no IPv4 checksum update happens here. If the egress control
// rewrites IPv4 fields (e.g. ECN marking), confirm the checksum is fixed up
// elsewhere.
// ---------------------------------------------------------------------------
control SwitchEgressDeparser(
    packet_out pkt,
    inout header_t hdr,
    in metadata_t meta,
    in egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr,
    in egress_intrinsic_metadata_t eg_intr_md,
    in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr) {
    apply {
        // do more stuff here if needed
        pkt.emit(hdr);
    }
}

#endif
--------------------------------------------------------------------------------
/leaf_conweave_resource/p4src/includes/registers_egress.p4:
--------------------------------------------------------------------------------
/* -*- P4_16 -*- */
#include <core.p4>
#if __TARGET_TOFINO__ == 2
#include <t2na.p4>
#else
#include <tna.p4>
#endif

#include "headers.p4"
#include "macro.p4"
#include "parser.p4"





/****************************************************************************
 *      D E Q U E U E   C O U N T E R   A T   E G R E S S
 ****************************************************************************/
/* One 32-bit counter per index, addressed by meta.idx_qdepth_history_rx. */
Register<bit<32>, conweave_qdepth_idx_width_t>(size=CONWEAVE_QDEPTH_IDX_SIZE) reg_buffer_egress_cntr;
RegisterAction<bit<32>, conweave_qdepth_idx_width_t, bit<32>>(reg_buffer_egress_cntr) reg_read_reset_buffer_egress_cntr = {
    void apply(inout bit<32> reg, out bit<32> result){
        if (hdr.cwctrl.drop == 1) {
            reg = 0; /** DROP: reorder is resolved, reset counter to 0 */
        }
        result = reg; /** READ: read register and save to cwctrl header */
    }
};
RegisterAction<bit<32>, conweave_qdepth_idx_width_t, bit<32>>(reg_buffer_egress_cntr) reg_increment_buffer_egress_cntr = {
    void apply(inout bit<32> reg){
        reg = reg |+| 1; /** DEQUEUE: increase counter by 1 (saturating add) */
    }
};
action do_read_reset_buffer_egress_cntr() {
    hdr.cwctrl.cntr_eg = reg_read_reset_buffer_egress_cntr.execute(meta.idx_qdepth_history_rx);
}
action do_increment_buffer_egress_cntr() {
    reg_increment_buffer_egress_cntr.execute(meta.idx_qdepth_history_rx);
}




/****************************************************************************
 *      R E O R D E R   Q U E U E   F L U S H   B Y   T A I L
 ****************************************************************************/
/* One 8-bit flag per hash index: set via the TAIL header's hashidx, read and
 * cleared via the ctrl header's hashidx. */
Register<bit<8>, hashidx_t>(size=CONWEAVE_TABLE_SIZE) reg_tail_resume;
RegisterAction<bit<8>, hashidx_t, bit<1>>(reg_tail_resume) reg_check_tail_resume = {
    void apply(inout bit<8> reg, out bit<1> result){
        result = (bit<1>)reg;
        if (reg == 1) {
            reg = 0;
        }
    }
};
RegisterAction<bit<8>, hashidx_t, bit<1>>(reg_tail_resume) reg_update_tail_resume = {
    void apply(inout bit<8> reg, out bit<1> result){
        reg = 1;
    }
};
action do_check_tail_resume() {
    meta.flag_check_tail_resume = reg_check_tail_resume.execute((hashidx_t)hdr.cwctrl.hashidx);
}
action do_update_tail_resume() {
    reg_update_tail_resume.execute((hashidx_t)hdr.tailh.hashidx);
}



/****************************************************************************
 *      E C N   M A R K I N G
 ****************************************************************************/

// ##### DCTCP ECN Marking #####
Register<bit<32>,bit<1>>(1,524287) reg_ecn_marking_threshold;  // default = 2^19 - 1
RegisterAction<bit<32>,bit<1>,bit<1>>(reg_ecn_marking_threshold) cmp_ecn_marking_threshold = {
    void apply(inout bit<32> reg_val, out bit<1> rv){
        /* mark when the dequeue depth reaches the configured threshold */
        if((bit<32>)eg_intr_md.deq_qdepth >= reg_val){
            rv = 1;
        }
        else{
            rv = 0;
        }
    }
};
action check_ecn_marking_threshold(){
    meta.mark_ecn_codepoint = cmp_ecn_marking_threshold.execute(0);
}



/****************************************************************************
 *      D E B U G G I N G
 ****************************************************************************/
/* Five independent single-entry 32-bit event counters. */
Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr1;
RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr1) reg_debug_eg_cntr1_action = {
    void apply(inout bit<32> reg, out bit<32> result) {
        reg = reg + 1;
    }
};
action do_debug_eg_cntr1() {
    reg_debug_eg_cntr1_action.execute(0);
}
Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr2;
RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr2) reg_debug_eg_cntr2_action = {
    void apply(inout bit<32> reg, out bit<32> result) {
        reg = reg + 1;
    }
};
action do_debug_eg_cntr2() {
    reg_debug_eg_cntr2_action.execute(0);
}
Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr3;
RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr3) reg_debug_eg_cntr3_action = {
    void apply(inout bit<32> reg, out bit<32> result) {
        reg = reg + 1;
    }
};
action do_debug_eg_cntr3() {
    reg_debug_eg_cntr3_action.execute(0);
}
Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr4;
RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr4) reg_debug_eg_cntr4_action = {
    void apply(inout bit<32> reg, out bit<32> result) {
        reg = reg + 1;
    }
};
action do_debug_eg_cntr4() {
    reg_debug_eg_cntr4_action.execute(0);
}
Register<bit<32>,bit<1>>(1, 0) reg_debug_eg_cntr5;
RegisterAction<bit<32>, bit<1>, bit<32>>(reg_debug_eg_cntr5) reg_debug_eg_cntr5_action = {
    void apply(inout bit<32> reg, out bit<32> result) {
        reg = reg + 1;
    }
};
action do_debug_eg_cntr5() {
    reg_debug_eg_cntr5_action.execute(0);
}

--------------------------------------------------------------------------------
/leaf_conweave_resource/p4src/includes/tables_egress.p4:
--------------------------------------------------------------------------------
/* -*- P4_16 -*- */
#include <core.p4>
#if __TARGET_TOFINO__ == 2
#include <t2na.p4>
#else
#include <tna.p4>
#endif

#include "headers.p4"
#include "macro.p4"
#include "parser.p4"


/****************************************************************************
 *      M A P   R E G - I N D E X   F O R   Q D E P T H   H I S T O R Y
 ****************************************************************************/
/* Data path: (egress port, egress queue) -> register index; sets the hit flag. */
action get_idx_queue_occupancy_array_data_eg(conweave_qdepth_idx_width_t idx) {
    meta.idx_qdepth_history_rx = idx;
    meta.hit_idx_queue_occupancy_tbl_eg = 1;
}

table do_get_idx_queue_occupancy_array_data_eg {
    key = {
        eg_intr_md.egress_port: exact;  // 9 bits
        eg_intr_md.egress_qid: exact;   // 7 bits
    }
    actions = { get_idx_queue_occupancy_array_data_eg; @defaultonly nop; }
    const default_action = nop();
    size = CONWEAVE_QDEPTH_IDX_SIZE;
}

/* Ctrl path: AFC message carried in the ctrl header -> register index; sets the hit flag. */
action get_idx_queue_occupancy_array_ctrl_eg(conweave_qdepth_idx_width_t idx) {
    meta.idx_qdepth_history_rx = idx;
    meta.hit_idx_queue_occupancy_tbl_eg = 1;
}

table do_get_idx_queue_occupancy_array_ctrl_eg {
    key = {
        hdr.cwctrl.afc_msg: exact;  // 32 bits
    }
    actions = { get_idx_queue_occupancy_array_ctrl_eg; @defaultonly nop; }
    const default_action = nop();
    size = CONWEAVE_QDEPTH_IDX_SIZE;
}

/****************************************************************************
 *      D C Q C N   C O N F I G U R A T I O N
 ****************************************************************************/


// ##### DCQCN ECN Marking #####
action dcqcn_mark_probability(bit<8> value) {
    meta.dcqcn_prob_output = value;
}

table dcqcn_get_ecn_probability {
    key = {
        eg_intr_md.deq_qdepth : range;  // 19 bits
    }
    actions = {
        dcqcn_mark_probability;
    }
    const default_action = dcqcn_mark_probability(0);  // default: no ecn mark
    size = 1024;
}

Random<bit<8>>() random;  // random seed for sampling
action dcqcn_get_random_number(){
    meta.dcqcn_random_number = random.get();
}

action dcqcn_check_ecn_marking() {
    meta.mark_ecn_codepoint = 1;
}

/* Marks when the (probability, random number) pair matches an installed entry. */
table dcqcn_compare_probability {
    key = {
        meta.dcqcn_prob_output : exact;
        meta.dcqcn_random_number : exact;
    }
    actions = {
        dcqcn_check_ecn_marking;
        @defaultonly nop;
    }
    const default_action = nop();
    size = 65536;
}
// ##### DCQCN ECN Marking (end) #####



/****************************************************************************
 *      P O R T S   F R O M   T O R   T O   D E S T I N A T I O N
 ****************************************************************************/

action check_toward_dst() {
    meta.last_hop = 1;
}
table do_check_toward_dst {
    key = {
        eg_intr_md.egress_port : exact;
    }
    actions = {
        check_toward_dst;
        @defaultonly nop;
    }
    const default_action = nop();
    size = 256;
}

--------------------------------------------------------------------------------
/leaf_conweave_resource/p4src/leaf_conweave_resource.p4:
--------------------------------------------------------------------------------
/* -*- P4_16 -*- */
#include <core.p4>
#if __TARGET_TOFINO__ == 2
#include <t2na.p4>
#else
#include <tna.p4>
#endif

#include "includes/conweave_egress.p4"
#include "includes/conweave_ingress.p4"
#include "includes/headers.p4"
#include "includes/macro.p4"
#include "includes/parser.p4"

Pipeline(SwitchIngressParser(),
         SwitchIngress(),
         SwitchIngressDeparser(),
         SwitchEgressParser(),
         SwitchEgress(),
         SwitchEgressDeparser()
         ) pipe;

Switch(pipe) main;
--------------------------------------------------------------------------------
/native_afc/README.md:
--------------------------------------------------------------------------------
# AFC Egress-Pause Speed Test
We compiled and tested the example using `bf-sde-9.11.1`.

The important part you must focus on is the AFC header:
```c
header adv_flow_ctl_h {
    bit<32> adv_flow_ctl;

    /** 32-bit adv_flow_ctl format */
    // bit<1> qfc;
    // bit<2> tm_pipe_id;
    // bit<4> tm_mac_id;
    // bit<3> _pad;
    // bit<7> tm_mac_qid;
    // bit<15> credit;
}
```


## Where AFC works
An AFC packet (with the AFC header) works at the ingress deparser and/or the egress deparser.
Note that you must be very careful if you want to cleanly pause the queue and all of its packets.
This is because the AFC packet must be activated at the pipeline's deparser, but it takes time for it to go through the pipeline.
So, there can be a minor leakage of some packets (dequeued) even though you pause the queue.
25 | 26 | Here is a toy example showing that, even if you resume and then immediately pause a queue, some packets will already have been dequeued and egressed because of this timing. 27 | 28 | ## Prerequisite 29 | 1. After building and running `native_afc.p4`, run the port setup using `cp/setup.py`: 30 | ```shell 31 | $ ./run_bfshell.sh -b `pwd`/cp/setup.py 32 | ``` 33 | Note that you should change the port numbers in the script to match your cabling. 34 | 35 | 36 | 2. In the `cp` directory, 37 | * We implemented `send_afc_pause.py`, which generates an AFC packet that pauses a specific queue. 38 | * We implemented `rpc_afc_config.py`, a `run_pd_rpc` script that enables AFC on the pipe. 39 | 40 | For more details, such as the mapping between logical and physical port numbering, see the `Tofino Native Architecture (TNA)` documentation from Intel Barefoot. 41 | 42 | 3. You must know the Tofino architecture. At a high level, a packet flows through the ingress pipeline, the traffic manager (queues), and the egress pipeline. Each pipeline includes a parser, a processing pipeline, and a deparser. 43 | You can refer to the `Tofino Native Architecture (TNA)` documentation. 44 | 45 | 46 | ## Speed of AFC from Ingress to Egress (25Gbps Port Speed) 47 | 1. Configure AFC at egress, and `eg_bypass = 0`: 48 | ``` 49 | set_afc_activate(1) # activate "AFC" at egress, deactivated at ingress 50 | set_afc_egress_bypass(0) # no bypass 51 | ``` 52 | Then send 1 `PAUSE` packet. This will pause the queue, and the `PAUSE` packet itself will be sent out of the egress port. 53 | 54 | 55 | 2. Use the scapy script `send_afc_pause.py`. Send 1 `PAUSE` and 10 `PING`. 56 | Check the queue occupancy with `bfshell`. The queue should have 11 packets (1 `PAUSE` and 10 `PING`). 57 | 58 | 59 | 3. Configure AFC at ingress, and `eg_bypass=1`: 60 | ``` 61 | set_afc_activate(0) # activate "AFC" at ingress, deactivated at egress 62 | set_afc_egress_bypass(1) # egress bypass 63 | ``` 64 | Then, send 1 `RESUME`. 65 | 66 | This `RESUME` packet will resume the queue at ingress and be queued. 
67 | After that, the previously enqueued `PAUSE` packet will go through the egress pipeline and pause the queue (activated at the egress deparser). 68 | 69 | 70 | 71 | ### Result 72 | We observe that a total of 8 packets (10 cells) are dequeued, including the `PAUSE` packet. 73 | 74 | Why not 11 packets? This is because, while the `PAUSE` packet is passing through the egress pipeline, 2 ping packets are dequeued concurrently. They are not affected by the `PAUSE` action because they have already been dequeued and are on their way to the egress port. 75 | In ConWeave, we carefully implemented the P4 program to account for this issue. 76 | To solve this, we used priority queues (but we did not mention this issue in the paper since it is too detailed). -------------------------------------------------------------------------------- /native_afc/cp/rpc_afc_config.py: -------------------------------------------------------------------------------- 1 | # python3 $SDE/run_pd_rpc.py `pwd`/rpc_afc_config.py 2 | 3 | # Step 1: enable AFC on the pipe 4 | pipe_id = 1 5 | qid = 0 6 | dev_port = 137 7 | rst = tm.sched_adv_fc_mode_enable_get(dev=0, pipe=pipe_id) 8 | print("Default enable mode on the pipe is " + str(rst)) 9 | tm.sched_adv_fc_mode_enable_set(dev=0, pipe=pipe_id, enable=True) 10 | rst = tm.sched_adv_fc_mode_enable_get(dev=0, pipe=pipe_id) 11 | print("After, enable mode on the pipe is " + str(rst)) 12 | 13 | # Step 2: enable AFC *mode* on the queue 14 | q_mode = tm.sched_q_adv_fc_mode_get(dev=0, port=dev_port, q=qid) 15 | print("Default queue mode is " + str(q_mode)) 16 | tm.sched_q_adv_fc_mode_set(dev=0, port=dev_port, q=qid, mode=1) # mode: 1 is XOFF, 0 is CREDIT 17 | q_mode = tm.sched_q_adv_fc_mode_get(dev=0, port=dev_port, q=qid) 18 | print("After set, queue mode is " + str(q_mode)) 19 | assert(q_mode==1) 20 | -------------------------------------------------------------------------------- /native_afc/cp/send_afc_pause.py: -------------------------------------------------------------------------------- 1 | from scapy.all 
import * 2 | import os 3 | import time 4 | 5 | 6 | myhost = os.uname()[1] 7 | assert(myhost=="lumos") 8 | iface="ens2f0" 9 | 10 | TOF2_0 = "64:9d:99:b1:26:0e" 11 | TOF2_1 = "64:9d:99:b1:26:0f" 12 | 13 | MINSIZE = 60 14 | ETHERTYPE_PAD = 0x2000 15 | ETHERTYPE_AFC = 0x2001 16 | 17 | #test_port = 9 #31/1 18 | #test_qid = 0 19 | 20 | # def get_pg_id(dev_port): 21 | # # each pipe has 64 dev_ports + divide by 8 to get the pg_id 22 | # pg_id = ((dev_port % 128) >> 3) 23 | # return pg_id 24 | 25 | # def get_pg_queue(dev_port, qid): 26 | # lane = dev_port % 8 27 | # pg_queue = lane * 16 + qid # there are 16 queues per lane 28 | # return pg_queue 29 | 30 | class AFC(Packet): 31 | name = "adv_flow_ctl" 32 | fields_desc=[ 33 | BitField("qfc", 0, 1), 34 | BitField("tm_pipe_id", 0, 2), 35 | BitField("tm_mac_id", 0, 4), 36 | BitField("pad", 0, 3), 37 | BitField("tm_mac_qid", 0, 7), 38 | BitField("credit", 0, 15) # 15-bit signed integer value 39 | ] 40 | 41 | 42 | def send_pause(pg_pipe, pg_port, pg_queue): 43 | pkt_pause = Ether(type=ETHERTYPE_AFC, src=TOF2_0, dst=TOF2_1) 44 | pkt_pause = pkt_pause/AFC( 45 | qfc = 1, # 1 bit 46 | tm_pipe_id = pg_pipe, # 2 bits 47 | tm_mac_id = pg_port, # 4 bits 48 | tm_mac_qid = pg_queue, # 7 bits (width declared in AFC.fields_desc; the 3-bit "pad" field is separate) 49 | credit = 1 # 1 in the pause packet; send_unpause() sends 0 50 | ) 51 | pkt_pause = pkt_pause/("0" * (MINSIZE - len(pkt_pause))) 52 | pkt_pause.show() 53 | sendp(pkt_pause, iface=iface) 54 | 55 | def send_unpause(pg_pipe, pg_port, pg_queue): 56 | pkt_unpause = Ether(type=ETHERTYPE_AFC, src=TOF2_0, dst=TOF2_1) 57 | pkt_unpause = pkt_unpause/AFC( 58 | qfc = 1, 59 | tm_pipe_id = pg_pipe, 60 | tm_mac_id = pg_port, 61 | tm_mac_qid = pg_queue, 62 | credit = 0 63 | ) 64 | pkt_unpause = pkt_unpause/("0" * (MINSIZE - len(pkt_unpause))) 65 | pkt_unpause.show() 66 | sendp(pkt_unpause, iface=iface) 67 | 68 | # print("************ Pause ************") 69 | send_pause(1, 1, 16) # dev_port = 137, qid = 0 70 | # print("************ Sleep 5 Seconds ************") 71 | # time.sleep(5) 72 | # print("************ 
Un-Pause ************") 73 | # send_unpause(1, 1, 16) # dev_port = 137, qid = 0 74 | -------------------------------------------------------------------------------- /native_afc/cp/setup.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import sys 3 | import os 4 | import time 5 | 6 | hostname = socket.gethostname() 7 | def add_port_config(bfrt, port_config): 8 | speed_dict = {'10G':'BF_SPEED_10G', '25G':'BF_SPEED_25G', '40G':'BF_SPEED_40G','50G':'BF_SPEED_50G', '100G':'BF_SPEED_100G', '400G': 'BF_SPEED_400G'} 9 | fec_dict = {'NONE':'BF_FEC_TYP_NONE', 'FC':'BF_FEC_TYP_FC', 'RS':'BF_FEC_TYP_REED_SOLOMON'} 10 | an_dict = {0:'PM_AN_DEFAULT', 1:'PM_AN_FORCE_ENABLE', 2:'PM_AN_FORCE_DISABLE'} 11 | lanes_dict = {'10G':(0,1,2,3), '25G':(0,1,2,3), '40G':(0,), '50G':(0,2), '100G':(0,)} 12 | lpbk_dict = {"none": "BF_LPBK_NONE", "mac-near": "BF_LPBK_MAC_NEAR"} 13 | 14 | if len(port_config) == 4: # no loopback option 15 | port_config += ("none", ) 16 | assert(len(port_config) == 5) 17 | 18 | # extract and map values from the config first 19 | conf_port = int(port_config[0].split('/')[0]) 20 | lane = port_config[0].split('/')[1] 21 | conf_speed = speed_dict[port_config[1]] 22 | conf_fec = fec_dict[port_config[2]] 23 | conf_an = an_dict[port_config[3]] 24 | conf_lpbk = lpbk_dict[port_config[4]] 25 | 26 | if lane == '-': # need to add all possible lanes 27 | lanes = lanes_dict[port_config[1]] 28 | for lane in lanes: 29 | dp = bfrt.port.port_hdl_info.get(CONN_ID=conf_port, CHNL_ID=lane, print_ents=False).data[b'$DEV_PORT'] 30 | bfrt.port.port.add(DEV_PORT=dp, SPEED=conf_speed, FEC=conf_fec, AUTO_NEGOTIATION=conf_an, PORT_ENABLE=True, LOOPBACK_MODE=conf_lpbk) 31 | else: # specific lane is requested 32 | conf_lane = int(lane) 33 | dp = bfrt.port.port_hdl_info.get(CONN_ID=conf_port, CHNL_ID=conf_lane, print_ents=False).data[b'$DEV_PORT'] 34 | bfrt.port.port.add(DEV_PORT=dp, SPEED=conf_speed, FEC=conf_fec, 
AUTO_NEGOTIATION=conf_an, PORT_ENABLE=True, LOOPBACK_MODE=conf_lpbk) 35 | 36 | print("Port Configuration - FP:{}, SPEED:{}, FEC:{}, AN:{}, LPBK:{}".format(port_config[0], port_config[1], port_config[2], port_config[3], port_config[4])) 37 | 38 | 39 | hostname = socket.gethostname() 40 | if hostname == 'tofino2a': 41 | fp_port_configs = [ 42 | ('1/0', '25G', 'NONE', 2), # lumos cwh52a 43 | ('1/1', '25G', 'NONE', 2), # lumos cwh52b 44 | ] 45 | 46 | # loopback 47 | fp_lpbk_configs = [ 48 | ('4/0', '400G', 'RS', 2, "mac-near"), # 400G needs RS FEC, 49 | ('32/0', '400G', 'RS', 2, "mac-near"), # 400G needs RS FEC, 50 | ] 51 | 52 | active_dev_ports = [136, 137] # connected to servers 53 | else: 54 | print("This setup script is for tofino2a/p4campus-proc1. But you are running on {}".format(hostname)) 55 | sys.exit(1) 56 | 57 | # port setup 58 | for config in fp_port_configs: 59 | add_port_config(bfrt, config) 60 | 61 | # lpbk port setup 62 | for lpbk_config in fp_lpbk_configs: 63 | add_port_config(bfrt, lpbk_config) 64 | 65 | 66 | # ARP 67 | if len(active_dev_ports) > 0: 68 | try: 69 | bfrt.pre.node.add(MULTICAST_NODE_ID=0, MULTICAST_RID=0, MULTICAST_LAG_ID=[], DEV_PORT=active_dev_ports) 70 | bfrt.pre.mgid.add(MGID=1, MULTICAST_NODE_ID=[0], MULTICAST_NODE_L1_XID_VALID=[False], MULTICAST_NODE_L1_XID=[0]) 71 | except: 72 | print("ARP entries may already exist, so skip!") 73 | 74 | p4 = bfrt.native_afc.pipe 75 | if hostname == 'tofino2a': 76 | ## for only one-switch config 77 | p4.SwitchIngress.simple_l2_forward.add_with_forward(ingress_port=136, port=137) 78 | p4.SwitchIngress.simple_l2_forward.add_with_forward(ingress_port=137, port=136) 79 | 80 | elif hostname == "p4campus-proc1": 81 | ### 3-hop topology 82 | p4.SwitchIngress.simple_l2_forward.add_with_forward(ingress_port=160, port=168) 83 | p4.SwitchIngress.simple_l2_forward.add_with_forward(ingress_port=168, port=160) 84 | else: 85 | print("This setup script is for tofino2a/p4campus-proc1. 
But you are running on {}".format(hostname)) 86 | sys.exit(1) 87 | 88 | 89 | 90 | # Advanced Flow Control 91 | ## get pg_id, pg_queue 92 | def get_pg_info(dev_port, queue_id): 93 | pipe = dev_port >> 7 94 | entry = bfrt.tf2.tm.port.cfg.get(dev_port, print_ents=False) 95 | pg_id = entry.data[b'pg_id'] 96 | pg_queue = entry.data[b'egress_qid_queues'][queue_id] 97 | print('DEV_PORT: {} QueueID: {} --> Pipe: {}, PG_ID: {}, PG_QUEUE: {}'.format(dev_port, queue_id, pipe, pg_id, pg_queue)) 98 | return (pipe, pg_id, pg_queue) # 137 -> 1, 1, 16 99 | 100 | pg_info = get_pg_info(dev_port=137, queue_id=0) 101 | # bfrt.tf2.tm.queue.sched_cfg.mod(pipe=pg_info[0], pg_id=pg_info[1], pg_queue=pg_info[2], advanced_flow_control="XOFF") 102 | 103 | def set_afc_activate(ingress_0_or_egress_1=0): 104 | p4.SwitchIngress.afc_where.mod(REGISTER_INDEX=0, f1=ingress_0_or_egress_1) 105 | p4.SwitchIngress.afc_where.dump(from_hw=True) 106 | 107 | def set_afc_forward(dev_port=137): 108 | p4.SwitchIngress.afc_forward.mod(REGISTER_INDEX=0, f1=dev_port) 109 | p4.SwitchIngress.afc_forward.dump(from_hw=True) 110 | 111 | def set_afc_egress_bypass(bypass_1_or_0=1): 112 | p4.SwitchIngress.afc_egress_bypass.mod(REGISTER_INDEX=0, f1=bypass_1_or_0) 113 | p4.SwitchIngress.afc_egress_bypass.dump(from_hw=True) 114 | 115 | def get_queue(dev_port=137, queue_id=0): 116 | pg_info = get_pg_info(dev_port, queue_id) 117 | bfrt.tf2.tm.counter.queue.get(from_hw=True, pipe=pg_info[0], pg_id=pg_info[1], pg_queue=pg_info[2]) 118 | 119 | def get_afc_record(): 120 | p4.SwitchIngress.afc_record.dump(from_hw=True) 121 | 122 | def help(): 123 | print("set_afc_activate(ingress_0_or_egress_1)") 124 | print("set_afc_forward(dev_port)") 125 | print("set_afc_egress_bypass(bypass_1_or_0)") 126 | print("get_afc_record()") 127 | 128 | -------------------------------------------------------------------------------- /native_afc/p4src/native_afc.p4: -------------------------------------------------------------------------------- 1 | /* 
-*- P4_16 -*- */ 2 | 3 | #include 4 | #if __TARGET_TOFINO__ == 2 5 | #include 6 | #else 7 | #include 8 | #endif 9 | 10 | /************************************************************************* 11 | ************* C O N S T A N T S A N D T Y P E S ******************* 12 | **************************************************************************/ 13 | 14 | 15 | /** 16 | * @brief Basic networking 17 | */ 18 | typedef bit<48> mac_addr_t; 19 | typedef bit<32> ipv4_addr_t; 20 | typedef bit<8> ip_protocol_t; 21 | const ip_protocol_t IP_PROTOCOL_UDP = 0x11; 22 | const ip_protocol_t IP_PROTOCOL_TCP = 0x6; 23 | const int MCAST_GRP_ID = 1; 24 | 25 | enum bit<16> ether_type_t { 26 | IPV4 = 0x0800, 27 | ARP = 0x0806, 28 | ETHERTYPE_AFC = 0x2001 29 | } 30 | enum bit<8> ipv4_proto_t { 31 | TCP = IP_PROTOCOL_TCP, 32 | UDP = IP_PROTOCOL_UDP 33 | } 34 | 35 | const bit<16> UDP_ROCE_V2 = 4791; // UDP RoCEv2 36 | 37 | /************************************************************************* 38 | *********************** H E A D E R S ********************************* 39 | *************************************************************************/ 40 | 41 | /* Define all the headers the program will recognize */ 42 | /* The actual sets of headers processed by each gress can differ */ 43 | 44 | /* Standard ethernet header */ 45 | header ethernet_h { 46 | mac_addr_t dst_addr; 47 | mac_addr_t src_addr; 48 | bit<16> ether_type; 49 | } 50 | 51 | header arp_h { 52 | bit<16> htype; 53 | bit<16> ptype; 54 | bit<8> hlen; 55 | bit<8> plen; 56 | bit<16> oper; 57 | mac_addr_t sender_hw_addr; 58 | ipv4_addr_t sender_ip_addr; 59 | mac_addr_t target_hw_addr; 60 | ipv4_addr_t target_ip_addr; 61 | } 62 | 63 | header ipv4_h { 64 | bit<4> version; 65 | bit<4> ihl; 66 | bit<6> dscp; 67 | bit<2> ecn; 68 | bit<16> total_len; 69 | bit<16> identification; 70 | bit<3> flags; 71 | bit<13> frag_offset; 72 | bit<8> ttl; 73 | bit<8> protocol; 74 | bit<16> hdr_checksum; 75 | ipv4_addr_t src_addr; 76 | ipv4_addr_t 
dst_addr; 77 | } 78 | 79 | header tcp_h { 80 | bit<16> src_port; 81 | bit<16> dst_port; 82 | bit<32> seq_no; 83 | bit<32> ack_no; 84 | bit<4> data_offset; 85 | bit<4> res; 86 | bit<8> flags; 87 | bit<16> window; 88 | bit<16> checksum; 89 | bit<16> urgent_ptr; 90 | } 91 | 92 | header udp_h { 93 | bit<16> src_port; 94 | bit<16> dst_port; 95 | bit<16> hdr_length; 96 | bit<16> checksum; 97 | } 98 | 99 | /** 100 | * @brief RoCEv2 headers 101 | */ 102 | 103 | header ib_bth_h { 104 | bit<8> opcode; 105 | bit<8> flags; // 1 bit solicited event, 1 bit migreq, 2 bit padcount, 4 bit headerversion 106 | bit<16> partition_key; 107 | bit<8> reserved0; 108 | bit<24> destination_qp; 109 | bit<1> ack_request; 110 | bit<7> reserved1; 111 | bit<24> packet_seqnum; 112 | } 113 | 114 | 115 | header adv_flow_ctl_h { 116 | bit<32> adv_flow_ctl; 117 | 118 | /** 32-bit adv_flow_ctl format */ 119 | // bit<1> qfc; 120 | // bit<2> tm_pipe_id; 121 | // bit<4> tm_mac_id; 122 | // bit<3> _pad; 123 | // bit<7> tm_mac_qid; 124 | // bit<15> credit; 125 | } 126 | 127 | 128 | /*********************** H E A D E R S ************************/ 129 | 130 | struct header_t { 131 | ethernet_h ethernet; 132 | adv_flow_ctl_h afc_msg; 133 | ipv4_h ipv4; 134 | arp_h arp; 135 | tcp_h tcp; 136 | udp_h udp; 137 | ib_bth_h bth; 138 | } 139 | 140 | /****** G L O B A L I N G R E S S M E T A D A T A *********/ 141 | struct metadata_t { 142 | bit<32> where_to_afc; 143 | PortId_t eg_port; 144 | bit<1> eg_bypass; 145 | } 146 | 147 | 148 | 149 | /************************************************************************* 150 | ************** I N G R E S S P R O C E S S I N G ******************* 151 | *************************************************************************/ 152 | 153 | /*********************** P A R S E R **************************/ 154 | parser SwitchIngressParser(packet_in pkt, 155 | out header_t hdr, 156 | out metadata_t meta, 157 | out ingress_intrinsic_metadata_t ig_intr_md, 158 | out 
ingress_intrinsic_metadata_for_tm_t ig_intr_md_for_tm, 159 | out ingress_intrinsic_metadata_from_parser_t ig_intr_md_from_prsr){ 160 | /* This is a mandatory state, required by Tofino Architecture */ 161 | state start { 162 | pkt.extract(ig_intr_md); 163 | pkt.advance(PORT_METADATA_SIZE); // macro defined in tofino.p4 164 | transition parse_ethernet; 165 | } 166 | 167 | state parse_ethernet { 168 | pkt.extract(hdr.ethernet); 169 | transition select(hdr.ethernet.ether_type){ 170 | (bit<16>)ether_type_t.IPV4: parse_ipv4; 171 | (bit<16>)ether_type_t.ARP: parse_arp; 172 | (bit<16>)ether_type_t.ETHERTYPE_AFC : parse_afc; 173 | default: accept; 174 | } 175 | } 176 | 177 | state parse_ipv4 { 178 | pkt.extract(hdr.ipv4); 179 | transition select(hdr.ipv4.protocol) { 180 | (bit<8>)ipv4_proto_t.TCP : parse_tcp; 181 | (bit<8>)ipv4_proto_t.UDP : parse_udp; 182 | default: accept; 183 | } 184 | } 185 | 186 | state parse_afc { 187 | pkt.extract(hdr.afc_msg); 188 | transition accept; 189 | } 190 | 191 | state parse_arp { 192 | pkt.extract(hdr.arp); 193 | transition accept; 194 | } 195 | 196 | state parse_tcp { 197 | pkt.extract(hdr.tcp); 198 | transition accept; 199 | } 200 | 201 | state parse_udp { 202 | pkt.extract(hdr.udp); 203 | transition select(hdr.udp.dst_port) { 204 | UDP_ROCE_V2: parse_bth; 205 | default: accept; 206 | } 207 | } 208 | 209 | state parse_bth { 210 | pkt.extract(hdr.bth); 211 | transition accept; 212 | } 213 | } 214 | 215 | 216 | 217 | 218 | 219 | /***************** M A T C H - A C T I O N *********************/ 220 | 221 | control SwitchIngress( 222 | /* User */ 223 | inout header_t hdr, 224 | inout metadata_t meta, 225 | /* Intrinsic */ 226 | in ingress_intrinsic_metadata_t ig_intr_md, 227 | in ingress_intrinsic_metadata_from_parser_t ig_intr_md_from_prsr, 228 | inout ingress_intrinsic_metadata_for_deparser_t ig_intr_md_for_dprsr, 229 | inout ingress_intrinsic_metadata_for_tm_t ig_intr_md_for_tm) { 230 | 231 | action nop(){} 232 | action drop(bit<3> 
drop_bits) { ig_intr_md_for_dprsr.drop_ctl = drop_bits; } 233 | action forward(PortId_t port) { 234 | ig_intr_md_for_tm.ucast_egress_port = port; 235 | } 236 | table simple_l2_forward { 237 | key = { 238 | ig_intr_md.ingress_port: exact; 239 | } 240 | actions = { 241 | forward; 242 | @defaultonly nop; 243 | } 244 | const default_action = nop(); 245 | size = 256; 246 | } 247 | 248 | /** 0: Ingress, 1: Egress */ 249 | Register, bit<1>>(1, 0) afc_where; 250 | RegisterAction, bit<1>, bit<32>>(afc_where) check_afc_where = { 251 | void apply(inout bit<32> value, out bit<32> result){ 252 | result = value; 253 | } 254 | }; 255 | action do_check_afc_where() { 256 | meta.where_to_afc = check_afc_where.execute(0); 257 | } 258 | 259 | Register, bit<1>>(1, 160) afc_forward; 260 | RegisterAction, bit<1>, bit<32>>(afc_forward) check_afc_forward = { 261 | void apply(inout bit<32> value, out bit<32> result){ 262 | result = value; 263 | } 264 | }; 265 | action do_check_afc_forward() { 266 | meta.eg_port = (PortId_t)check_afc_forward.execute(0); 267 | } 268 | 269 | 270 | Register, bit<1>>(1, 1) afc_egress_bypass; 271 | RegisterAction, bit<1>, bit<1>>(afc_egress_bypass) check_afc_egress_bypass = { 272 | void apply(inout bit<32> value, out bit<1> result){ 273 | result = value[0:0]; 274 | } 275 | }; 276 | action do_check_afc_egress_bypass() { 277 | meta.eg_bypass = check_afc_egress_bypass.execute(0); 278 | } 279 | 280 | Register, bit<1>>(1, 0) afc_record; 281 | RegisterAction, bit<1>, bit<1>>(afc_record) check_afc_record = { 282 | void apply(inout bit<32> value, out bit<1> result){ 283 | value = hdr.afc_msg.adv_flow_ctl; 284 | } 285 | }; 286 | action do_check_afc_record() { 287 | check_afc_record.execute(0); 288 | } 289 | 290 | 291 | apply { 292 | if(hdr.ethernet.ether_type == (bit<16>) ether_type_t.ARP){ 293 | // do the broadcast to all involved ports 294 | ig_intr_md_for_tm.mcast_grp_a = MCAST_GRP_ID; 295 | ig_intr_md_for_tm.rid = 0; 296 | } else { 297 | if (ig_intr_md.ingress_port == 
(PortId_t)160) { 298 | drop(0x1); 299 | exit; 300 | } 301 | do_check_afc_forward(); // -> meta.eg_port 302 | do_check_afc_where(); // -> meta.where_to_afc (0: ingress, 1: egress) 303 | do_check_afc_egress_bypass(); // meta.eg_bypass (1: bypass) 304 | 305 | if (hdr.ethernet.ether_type == (bit<16>)ether_type_t.ETHERTYPE_AFC) { 306 | forward(meta.eg_port); 307 | if (meta.where_to_afc == 0) { /* AFC at Ingress */ 308 | /* pass adv_flow_ctl message */ 309 | ig_intr_md_for_dprsr.adv_flow_ctl = hdr.afc_msg.adv_flow_ctl; 310 | ig_intr_md_for_tm.bypass_egress = meta.eg_bypass; // uncomment to bypass egress processing 311 | do_check_afc_record(); // debugging 312 | } 313 | } else { 314 | /* forward */ 315 | simple_l2_forward.apply(); 316 | } 317 | } 318 | // ig_intr_md_for_tm.bypass_egress = 1; // uncomment to bypass egress processing 319 | } 320 | } 321 | 322 | /********************* D E P A R S E R ************************/ 323 | 324 | control SwitchIngressDeparser(packet_out pkt, 325 | /* User */ 326 | inout header_t hdr, 327 | in metadata_t meta, 328 | /* Intrinsic */ 329 | in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) { 330 | 331 | Checksum() ipv4_checksum; 332 | 333 | apply { 334 | hdr.ipv4.hdr_checksum = ipv4_checksum.update({ 335 | hdr.ipv4.version, 336 | hdr.ipv4.ihl, 337 | hdr.ipv4.dscp, 338 | hdr.ipv4.ecn, 339 | hdr.ipv4.total_len, 340 | hdr.ipv4.identification, 341 | hdr.ipv4.flags, 342 | hdr.ipv4.frag_offset, 343 | hdr.ipv4.ttl, 344 | hdr.ipv4.protocol, 345 | hdr.ipv4.src_addr, 346 | hdr.ipv4.dst_addr}); 347 | 348 | pkt.emit(hdr); 349 | } 350 | } 351 | 352 | /************************************************************************* 353 | **************** E G R E S S P R O C E S S I N G ******************* 354 | *************************************************************************/ 355 | 356 | 357 | /*********************** P A R S E R **************************/ 358 | 359 | parser SwitchEgressParser(packet_in pkt, 360 | out header_t hdr, 361 | 
out metadata_t meta, 362 | out egress_intrinsic_metadata_t eg_intr_md, 363 | out egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr){ 364 | 365 | /* This is a mandatory state, required by Tofino Architecture */ 366 | state start { 367 | pkt.extract(eg_intr_md); 368 | transition parse_ethernet; 369 | } 370 | 371 | state parse_ethernet { 372 | pkt.extract(hdr.ethernet); 373 | transition select(hdr.ethernet.ether_type) { 374 | (bit<16>)ether_type_t.IPV4 : parse_ipv4; 375 | (bit<16>)ether_type_t.ARP : parse_arp; 376 | (bit<16>)ether_type_t.ETHERTYPE_AFC : parse_afc; 377 | default: accept; 378 | } 379 | } 380 | 381 | state parse_ipv4 { 382 | pkt.extract(hdr.ipv4); 383 | transition select(hdr.ipv4.protocol) { 384 | (bit<8>)ipv4_proto_t.TCP : parse_tcp; 385 | (bit<8>)ipv4_proto_t.UDP : parse_udp; 386 | default: accept; 387 | } 388 | } 389 | 390 | state parse_afc { 391 | pkt.extract(hdr.afc_msg); 392 | transition accept; 393 | } 394 | 395 | state parse_arp { 396 | pkt.extract(hdr.arp); 397 | transition accept; 398 | } 399 | 400 | state parse_tcp { 401 | pkt.extract(hdr.tcp); 402 | transition accept; 403 | } 404 | 405 | state parse_udp { 406 | pkt.extract(hdr.udp); 407 | transition select(hdr.udp.dst_port) { 408 | UDP_ROCE_V2: parse_bth; 409 | default: accept; 410 | } 411 | } 412 | 413 | state parse_bth { 414 | pkt.extract(hdr.bth); 415 | transition accept; 416 | } 417 | } 418 | 419 | /***************** M A T C H - A C T I O N *********************/ 420 | 421 | control SwitchEgress( 422 | inout header_t hdr, 423 | inout metadata_t meta, 424 | in egress_intrinsic_metadata_t eg_intr_md, 425 | in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr, 426 | inout egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr, 427 | inout egress_intrinsic_metadata_for_output_port_t eg_intr_md_for_oport) { 428 | 429 | Register,_>(1, 0) reg_debug_cntr0; 430 | RegisterAction, _, bit<32>>(reg_debug_cntr0) reg_debug_cntr0_action = { 431 | void apply(inout bit<32> reg, 
out bit<32> result) { 432 | reg = (bit<32>)eg_intr_md.egress_port; 433 | } 434 | }; 435 | 436 | Register,_>(1, 0) reg_debug_egress_qid; 437 | RegisterAction, _, bit<32>>(reg_debug_egress_qid) reg_debug_egress_qid_action = { 438 | void apply(inout bit<32> reg, out bit<32> result) { 439 | reg = (bit<32>)eg_intr_md.egress_qid; 440 | } 441 | }; 442 | 443 | 444 | apply { 445 | reg_debug_cntr0_action.execute(0); /* despite the name, not a counter: stores the egress_port of the most recent packet */ 446 | 447 | reg_debug_egress_qid_action.execute(0); /* get egress_qid (is it 0, or 16, when ingress's qid was 0?) */ 448 | 449 | if (hdr.afc_msg.isValid()) { 450 | eg_intr_md_for_dprsr.adv_flow_ctl = hdr.afc_msg.adv_flow_ctl; 451 | } 452 | 453 | } 454 | } 455 | 456 | /********************* D E P A R S E R ************************/ 457 | 458 | control SwitchEgressDeparser(packet_out pkt, 459 | /* User */ 460 | inout header_t hdr, 461 | in metadata_t meta, 462 | /* Intrinsic */ 463 | in egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr, 464 | in egress_intrinsic_metadata_t eg_intr_md, 465 | in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr){ 466 | 467 | Checksum() ipv4_checksum; 468 | 469 | apply{ 470 | hdr.ipv4.hdr_checksum = ipv4_checksum.update({ 471 | hdr.ipv4.version, 472 | hdr.ipv4.ihl, 473 | hdr.ipv4.dscp, 474 | hdr.ipv4.ecn, 475 | hdr.ipv4.total_len, 476 | hdr.ipv4.identification, 477 | hdr.ipv4.flags, 478 | hdr.ipv4.frag_offset, 479 | hdr.ipv4.ttl, 480 | hdr.ipv4.protocol, 481 | hdr.ipv4.src_addr, 482 | hdr.ipv4.dst_addr}); 483 | 484 | pkt.emit(hdr); 485 | } 486 | } 487 | 488 | 489 | /************ F I N A L P A C K A G E ******************************/ 490 | Pipeline( 491 | SwitchIngressParser(), 492 | SwitchIngress(), 493 | SwitchIngressDeparser(), 494 | SwitchEgressParser(), 495 | SwitchEgress(), 496 | SwitchEgressDeparser() 497 | ) pipe; 498 | 499 | Switch(pipe) main; -------------------------------------------------------------------------------- /native_dcqcn/README.md: 
-------------------------------------------------------------------------------- 1 | ## DCQCN ECN-Marking Example 2 | 3 | This repo includes DCQCN ECN marking example using range-based match-action table. 4 | Please ignore some code blocks for ingress/egress mirroring, or you can reuse them to verify whether packets are correctly ECN-marked. 5 | 6 | * There are two key scripts: `p4src/native_dcqcn.p4` and the control plane scripts in `cp/setup.py`. 7 | 8 | * We built and tested using `bf-sde-9.11.1` and Tofino1 switch. 9 | -------------------------------------------------------------------------------- /native_dcqcn/cp/devtest_cmds.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | INGRESS_PORT = 128 5 | EGRESS_PORT = 129 6 | 7 | # CC Mode 8 | reg_cc_mode = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.reg_cc_mode # cc mode (5: DCTCP, 9: DCQCN) 9 | 10 | # DCTCP 11 | reg_ecn_marking_threshold = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.reg_ecn_marking_threshold # cells 12 | 13 | # DCQCN 14 | DCQCN_K_MIN = 1250 # 100KB - 1250 15 | DCQCN_K_MAX = 3000 # 240KB # 400KB - 5000 16 | DCQCN_P_MAX = 0.2 # 20% 17 | QDEPTH_RANGE_MAX = 2**19 18 | SEED_RANGE_MAX = 256 # random number range ~ [0, 255] (8bits) 19 | SEED_K_MAX = math.ceil(DCQCN_P_MAX * SEED_RANGE_MAX) # 52 20 | QDEPTH_STEPSIZE = math.floor((DCQCN_K_MAX - DCQCN_K_MIN) / SEED_K_MAX) # 72 21 | last_range = DCQCN_K_MIN 22 | dcqcn_get_ecn_probability = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.dcqcn_get_ecn_probability # table 23 | 24 | # DEBUGGING 25 | reg_ecn_marking_cntr = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.reg_ecn_marking_cntr # ECN marked packets 26 | 27 | def check_status(): 28 | """ 29 | Check status of current values and configuration 30 | 31 | Returns: 32 | None 33 | """ 34 | print("---- CHECK STATUS ----") 35 | 36 | # ECN marking threshold (unit: cells, 80Bytes) 37 | val_reg_cc_mode = reg_cc_mode.get(REGISTER_INDEX=0, 
def change_cc_mode(cc_mode: int):
    """
    Change CC Mode: 5 (DCTCP), 9 (DCQCN)

    Writes `cc_mode` into index 0 of `reg_cc_mode` and clears the
    ECN-marked packet counter. Any other value is rejected with an error
    message and nothing is modified.

    Returns:
        None
    """
    mode_names = {5: "DCTCP", 9: "DCQCN"}

    print("---- CHANGE CC MODE ----")
    if cc_mode not in mode_names:
        print("ERROR!! input should be either 5 (DCTCP) or 9 (DCQCN). Do nothing.")
        # "Do nothing" includes leaving the debug counter untouched.
        return

    print("Change CC Mode to {}!".format(mode_names[cc_mode]))
    reg_cc_mode.mod(REGISTER_INDEX=0, f1=cc_mode)

    print("Reset ECN-marked packet counter as 0")
    reg_ecn_marking_cntr.clear()


def reconfig_dctcp_ecn_threshold(ecn_marking: int):
    """
    Overwrite the DCTCP static ECN-marking threshold register.

    Args:
        ecn_marking: raw value written to `reg_ecn_marking_threshold[0]`.
            # NOTE(review): the statistics code in this file converts the
            # register value to KB with `* 80 / 1000`, so the unit is
            # presumably 80-byte cells -- confirm.

    Prints an alert (but still applies the change) when the switch is not
    currently in DCTCP mode (5), then clears the ECN-marked packet counter.
    """
    val_reg_cc_mode = reg_cc_mode.get(REGISTER_INDEX=0, from_hw=True, print_ents=False).data[b'SwitchEgress.reg_cc_mode.f1'][1]
    if val_reg_cc_mode != 5:
        print("ALERT!! CC Mode is not DCTCP")

    reg_ecn_marking_threshold.mod(REGISTER_INDEX=0, f1=ecn_marking)
    print("Changed ECN Marking threshold to {}".format(ecn_marking))

    print("Reset ECN-marked packet counter as 0")
    reg_ecn_marking_cntr.clear()


def reconfig_dcqcn_ecn_threshold(Kmin:int, Kmax: int, Pmax: float):
    """
    Rebuild the DCQCN (RED-like) ECN marking tables for new parameters.

    Args:
        Kmin: queue depth below which nothing is marked.
        Kmax: queue depth above which every packet is marked.
        Pmax: marking probability reached just below Kmax (0 < Pmax <= 1).

    The probability table maps queue-depth ranges to an 8-bit value, and the
    comparison table holds one entry per (value, random number) pair that
    should be marked. Invalid parameters are rejected before any table is
    cleared, so a bad call leaves the current configuration in place.
    """
    # These names are module-level configuration that other code in this file
    # prints; without `global` the assignments below would only create locals
    # and the reported Kmin/Kmax/Pmax would stay stale.
    global DCQCN_K_MIN, DCQCN_K_MAX, DCQCN_P_MAX, SEED_K_MAX, QDEPTH_STEPSIZE

    val_reg_cc_mode = reg_cc_mode.get(REGISTER_INDEX=0, from_hw=True, print_ents=False).data[b'SwitchEgress.reg_cc_mode.f1'][1]
    if val_reg_cc_mode != 9:
        print("ALERT!! CC Mode is not DCQCN")

    # Validate first: Pmax == 0 would divide by zero, and a step size of 0
    # (or Kmin == 0) would generate ranges whose end precedes their start.
    if not (0 < Pmax <= 1):
        print("ERROR!! Pmax must satisfy 0 < Pmax <= 1. Do nothing.")
        return
    if not (0 < Kmin < Kmax < QDEPTH_RANGE_MAX):
        print("ERROR!! need 0 < Kmin < Kmax < {}. Do nothing.".format(QDEPTH_RANGE_MAX))
        return
    new_seed_k_max = math.ceil(Pmax * SEED_RANGE_MAX)
    new_stepsize = math.floor((Kmax - Kmin) / new_seed_k_max)
    if new_stepsize < 1:
        print("ERROR!! (Kmax - Kmin) is too small for Pmax: need at least {} queue-depth units. Do nothing.".format(new_seed_k_max))
        return

    DCQCN_K_MIN = Kmin
    DCQCN_K_MAX = Kmax
    DCQCN_P_MAX = Pmax
    SEED_K_MAX = new_seed_k_max
    QDEPTH_STEPSIZE = new_stepsize
    last_range = DCQCN_K_MIN
    dcqcn_get_ecn_probability = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.dcqcn_get_ecn_probability
    dcqcn_compare_probability = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.dcqcn_compare_probability

    #####################
    # PROBABILITY TABLE #
    #####################
    # clear table
    print("Clear DCQCN ECN marking / comparing table...")
    dcqcn_get_ecn_probability.clear()
    dcqcn_compare_probability.clear()

    print("Reconfigure DCQCN ECN marking table...")
    # < K_MIN
    print("DCQCN Table -- Adding qdepth:[{}, {}] -> probability:{:.2f}% ({}/{})".format(0, DCQCN_K_MIN - 1, float(0/SEED_RANGE_MAX)*100, 0, SEED_RANGE_MAX))
    dcqcn_get_ecn_probability.add_with_dcqcn_mark_probability(deq_qdepth_start=0, deq_qdepth_end=DCQCN_K_MIN - 1, value=0)
    # K_MIN < qDepth < K_MAX
    for i in range(1, SEED_K_MAX):
        range_end = last_range + QDEPTH_STEPSIZE - 1
        print("DCQCN Table -- Adding qdepth:[{}, {}] -> probability:{:.2f}% ({}/{})".format(last_range, range_end, float(i/SEED_RANGE_MAX)*100, i, SEED_RANGE_MAX))
        dcqcn_get_ecn_probability.add_with_dcqcn_mark_probability(deq_qdepth_start=last_range, deq_qdepth_end=range_end, value=i)
        last_range += QDEPTH_STEPSIZE
    # > K_MAX
    print("DCQCN Table -- Adding qdepth:[{}, {}] -> probability:{:.2f}%".format(last_range, QDEPTH_RANGE_MAX - 1, float(SEED_RANGE_MAX/SEED_RANGE_MAX)*100))
    dcqcn_get_ecn_probability.add_with_dcqcn_mark_probability(deq_qdepth_start=last_range, deq_qdepth_end=QDEPTH_RANGE_MAX - 1, value=SEED_RANGE_MAX - 1)

    ####################
    # COMPARISON TABLE #
    ####################
    # Less than 100%: mark when random_number < prob_output.
    # The value SEED_RANGE_MAX - 1 is reserved for the 100% rows below; cap
    # the loop so Pmax == 1 does not try to insert the same keys twice.
    for prob_output in range(1, min(SEED_K_MAX, SEED_RANGE_MAX - 1)):
        for random_number in range(prob_output):
            print("Comparison Table -- ECN Marking for Random Number {}, Output Value {}".format(random_number, prob_output))
            dcqcn_compare_probability.add_with_dcqcn_check_ecn_marking(dcqcn_prob_output=prob_output, dcqcn_random_number=random_number)
    # 100% ECN Marking
    prob_output = SEED_RANGE_MAX - 1
    for random_number in range(SEED_RANGE_MAX):
        print("Comparison Table -- ECN Marking for Random Number {} < Output Value {}".format(random_number, prob_output))
        dcqcn_compare_probability.add_with_dcqcn_check_ecn_marking(dcqcn_prob_output=prob_output, dcqcn_random_number=random_number)

    print("Reset ECN-marked packet counter as 0")
    reg_ecn_marking_cntr.clear()
# ---------------------------------------------------------------------------
# Command-line handling
# ---------------------------------------------------------------------------
if len(sys.argv) != 4:
    print("Usage: {} <loss_rate> <expt_type> <block_mode>".format(sys.argv[0]))
    sys.exit(1)

loss_rate = sys.argv[1]
expt_type = sys.argv[2]
block_mode = sys.argv[3]

if loss_rate not in ["000", "100", "010"]:
    print("Loss rate should be either 000, 100, 010")
    sys.exit(1)
if expt_type not in ["baseline", "lg"]:
    print("expt_type should be either baseline, lg")
    sys.exit(1)
if block_mode not in ["blk", "noblk"]:
    print("block_mode should be either blk, noblk")
    sys.exit(1)

logname = "rxinfo_{loss_rate}_{expt_type}_{block_mode}.txt".format(loss_rate=loss_rate, expt_type=expt_type, block_mode=block_mode)

# Start every run with a fresh log file.
if os.path.exists(logname):
    os.remove(logname)

# ---------------------------------------------------------------------------
# Make the SDE's Python packages importable
# ---------------------------------------------------------------------------
SDE_INSTALL = os.environ['SDE_INSTALL']
PYTHON3_VER = '{}.{}'.format(
    sys.version_info.major,
    sys.version_info.minor)
SDE_PYTHON3 = os.path.join(SDE_INSTALL, 'lib', 'python' + PYTHON3_VER,
                           'site-packages')
sys.path.append(SDE_PYTHON3)
sys.path.append(os.path.join(SDE_PYTHON3, 'tofino'))
sys.path.append(os.path.join(SDE_PYTHON3, 'tofino', 'bfrt_grpc'))

import bfrt_grpc.client as gc

RX_DEV_PORT_ON_REMOTE_SWITCH = 132 # 60
BFRT_CLIENT_ID = 99

# running on tofino1a
bfrt_endpoint = 'localhost'
bfrt_port = 50052
bfrt_info = None
dev_tgt = None
interface = None

def init_bfrt():
    """Connect to the BFRT gRPC server and bind to the running P4 program.

    Populates the module-level `interface`, `bfrt_info` and `dev_tgt`.
    Exits the process with status 1 if the connection cannot be set up.
    """
    global bfrt_info
    global dev_tgt
    global interface
    try:
        interface = gc.ClientInterface(
            grpc_addr = str(bfrt_endpoint) + ":" + str(bfrt_port),
            client_id = BFRT_CLIENT_ID,
            device_id = 0,
            num_tries = 1)
    except Exception as err:
        # Fail loudly here; otherwise `interface` stays None and the next
        # line would die with an unrelated AttributeError.
        print("Failed to connect to BFRT at {}:{} ({})".format(bfrt_endpoint, bfrt_port, err))
        sys.exit(1)
    bfrt_info = interface.bfrt_info_get()
    # print('The target runs the program:', bfrt_info.p4_name_get())
    interface.bind_pipeline_config(bfrt_info.p4_name_get())
    dev_tgt = gc.Target(0)

init_bfrt()

port_stat_table = bfrt_info.table_get('$PORT_STAT')
key = port_stat_table.make_key([gc.KeyTuple('$DEV_PORT', RX_DEV_PORT_ON_REMOTE_SWITCH)])

# Report the overhead of two back-to-back timer reads.
start = time.perf_counter()
end = time.perf_counter()
elapsed = end - start
print("elapsed time = {:.12f} seconds".format(elapsed))

# ---------------------------------------------------------------------------
# Sample the RX byte counter once per elapsed second
# ---------------------------------------------------------------------------
seconds = -1
start = time.perf_counter()
prev_time = None   # timestamp of the previous sample (None until one exists)
prev_bytes = None  # counter value of the previous sample
while (True):
    current_time = time.perf_counter()
    elapsed = current_time - start
    if math.floor(elapsed) <= seconds:
        # Still inside the same second: yield the CPU instead of spinning.
        time.sleep(0.001)
        continue

    # (at least) 1 second elapsed
    seconds += (math.floor(elapsed) - seconds)

    # read from switch
    response = port_stat_table.entry_get(dev_tgt, [key], {'from_hw': False}, None)
    first_resp_entry = list(response)[0] # only have 1 in this case
    # entry is a tuple: (data obj, key obj). Get the data obj and convert to a dict
    rx_port_stats = first_resp_entry[0].to_dict()
    rx_octets = rx_port_stats['$OctetsReceived']

    # write file and close
    with open(logname, "a") as f:
        f.write("{timestamp} {bytes}\n".format(timestamp=elapsed, bytes=rx_octets))

    # The first sample only establishes a baseline: the counter value is
    # cumulative, so a rate can only be derived from two real samples.
    if prev_time is not None:
        delta_time = elapsed - prev_time
        delta_byte = rx_octets - prev_bytes
        print("[Throughput] t:{:d} - {:.2f} Gbps".format(round(seconds), delta_byte / delta_time / 1000000000.0 * 8))
    prev_time = elapsed
    prev_bytes = rx_octets
### HOW TO RUN ###
# $SDE/run_bfshell.sh -b `pwd`/setup.py -i
###

# interfaces
INFO_DEV_PORT_PATRONUS_ENS1F1 = 147

MIRROR_SESSION_RDMA_SNIFF_IG = 777 # mirroring's session id for sniffing RDMA packets for IG_MIRROR
MIRROR_SESSION_RDMA_SNIFF_EG = 888 # mirroring's session id for sniffing RDMA packets for EG_MIRROR

# config_pktgen_script='..../config_pktgen.py'
# NOTE(review): this resolves to the filesystem root; presumably a directory
# prefix is meant to be filled in before use -- confirm.
devtest_cmds_file = "/devtest_cmds.py"

hostname = socket.gethostname()
print("Hostname: {}".format(hostname))

# Check the host before touching any switch state. Everything below is written
# for tofino1b only; bailing out here avoids failing later on names that are
# only defined for that host.
if hostname != "tofino1b":
    print("This setup script is for tofino1b/1c. But you are running on {}".format(hostname))
    sys.exit(1)

# (front-panel port "conn/lane", speed, FEC, auto-negotiation code)
fp_port_configs = [
    ('31/0', '25G', 'NONE', 2), # lumos ens2f1
    ('31/1', '25G', 'NONE', 2), # hajime enp6s0f1
    ('29/3', '25G', 'NONE', 2), # monitoring patronus ens1f1
]

def add_port_config(port_config):
    """Add and enable the device port(s) described by one config tuple.

    Args:
        port_config: (port, speed, fec, an) where
            port  -- "<conn>/<lane>"; a lane of "-" adds every lane that
                     `lanes_dict` lists for the given speed,
            speed -- key of `speed_dict` ('10G' ... '100G'),
            fec   -- key of `fec_dict` ('NONE', 'FC', 'RS'),
            an    -- key of `an_dict` (0, 1 or 2).

    Raises:
        KeyError: if speed, fec or an is not a known key.
    """
    speed_dict = {'10G':'BF_SPEED_10G', '25G':'BF_SPEED_25G', '40G':'BF_SPEED_40G','50G':'BF_SPEED_50G', '100G':'BF_SPEED_100G'}
    fec_dict = {'NONE':'BF_FEC_TYP_NONE', 'FC':'BF_FEC_TYP_FC', 'RS':'BF_FEC_TYP_RS'}
    an_dict = {0:'PM_AN_DEFAULT', 1:'PM_AN_FORCE_ENABLE', 2:'PM_AN_FORCE_DISABLE'}
    lanes_dict = {'10G':(0,1,2,3), '25G':(0,1,2,3), '40G':(0,), '50G':(0,2), '100G':(0,)}

    # extract and map values from the config first
    conf_port = int(port_config[0].split('/')[0])
    lane = port_config[0].split('/')[1]
    conf_speed = speed_dict[port_config[1]]
    conf_fec = fec_dict[port_config[2]]
    conf_an = an_dict[port_config[3]]

    # '-' means "all possible lanes for this speed"; otherwise one specific lane.
    channels = lanes_dict[port_config[1]] if lane == '-' else (int(lane),)
    for channel in channels:
        dp = bfrt.port.port_hdl_info.get(CONN_ID=conf_port, CHNL_ID=channel, print_ents=False).data[b'$DEV_PORT']
        bfrt.port.port.add(DEV_PORT=dp, SPEED=conf_speed, FEC=conf_fec, AUTO_NEGOTIATION=conf_an, PORT_ENABLE=True)

for config in fp_port_configs:
    add_port_config(config)


port_metadata_tbl = bfrt.dcqcn_buffering_test.pipe.SwitchIngressParser.PORT_METADATA # switch_id
l2_forward = bfrt.dcqcn_buffering_test.pipe.SwitchIngress.l2_forward

# For topology figure: https://app.diagrams.net/#G1hy8wHlz500QMTVTgnsO1ZrWA1gGPQ_zx

# FORM virtual switches - aka fill the port_metadata table
port_metadata_tbl.add(ingress_port=128, switch_id=0)
port_metadata_tbl.add(ingress_port=129, switch_id=0)

# Add entries to the l2_forward table
l2_forward.add_with_forward(dst_addr=0xb8cef6046c05, switch_id=0, port=129) # to sender (DATA)
l2_forward.add_with_forward(dst_addr=0xb8cef6046bd1, switch_id=0, port=128) # to receiver (ACK)

# XXX monitoring entry to patronus ens1f1 (dp 29/3)
l2_forward.add_with_forward(dst_addr=0x649d99b10ee1, switch_id=0, port=INFO_DEV_PORT_PATRONUS_ENS1F1)

# # Pktgen pkt's forwarding from sw2 to sw3
# l2_forward.add_with_forward(dst_addr=RECEIVER_SW_ADDR, switch_id=2, port=172)


# Setup ARP broadcast for the active dev ports
active_dev_ports = [128, 129, INFO_DEV_PORT_PATRONUS_ENS1F1]

# ARP
bfrt.pre.node.add(MULTICAST_NODE_ID=0, MULTICAST_RID=0, MULTICAST_LAG_ID=[], DEV_PORT=active_dev_ports)
bfrt.pre.mgid.add(MGID=1, MULTICAST_NODE_ID=[0], MULTICAST_NODE_L1_XID_VALID=[False], MULTICAST_NODE_L1_XID=[0])

# Setup mirroring
bfrt.mirror.cfg.add_with_normal(sid=MIRROR_SESSION_RDMA_SNIFF_IG, direction='INGRESS', session_enable=True, ucast_egress_port=INFO_DEV_PORT_PATRONUS_ENS1F1, ucast_egress_port_valid=1, max_pkt_len=16384)
bfrt.mirror.cfg.add_with_normal(sid=MIRROR_SESSION_RDMA_SNIFF_EG, direction='EGRESS', session_enable=True, ucast_egress_port=INFO_DEV_PORT_PATRONUS_ENS1F1, ucast_egress_port_valid=1, max_pkt_len=16384)


# Setup ECN marking for DCTCP
reg_ecn_marking_threshold = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.reg_ecn_marking_threshold
# reg_ecn_marking_threshold.mod(REGISTER_INDEX=0, f1=375) # 375 x 80 = 30KB (20 pkts) | 1 Gbps
reg_ecn_marking_threshold.mod(REGISTER_INDEX=0, f1=1250) # 1250 x 80 = 100KB (65 pkts) | 10 Gbps

# Setup RED-based ECN marking for DCQCN
DCQCN_K_MIN = 1250 # 100KB
DCQCN_K_MAX = 3000 # 240KB # 400KB - 5000
DCQCN_P_MAX = 0.2 # 20%
QDEPTH_RANGE_MAX = 2**19
SEED_RANGE_MAX = 256 # random number range ~ [0, 255] (8bits)
SEED_K_MAX = math.ceil(DCQCN_P_MAX * SEED_RANGE_MAX) # 52
QDEPTH_STEPSIZE = math.floor((DCQCN_K_MAX - DCQCN_K_MIN) / SEED_K_MAX) # 33 (it would be 72 with K_MAX = 5000)

last_range = DCQCN_K_MIN
#####################
# PROBABILITY TABLE #
#####################
dcqcn_get_ecn_probability = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.dcqcn_get_ecn_probability
# < K_MIN
print("DCQCN Table -- Adding qdepth:[{}, {}] -> probability:{:.2f}% ({}/{})".format(0, DCQCN_K_MIN - 1, float(0/SEED_RANGE_MAX)*100, 0, SEED_RANGE_MAX))
dcqcn_get_ecn_probability.add_with_dcqcn_mark_probability(deq_qdepth_start=0, deq_qdepth_end=DCQCN_K_MIN - 1, value=0)
# K_MIN < qDepth < K_MAX
for i in range(1, SEED_K_MAX):
    print("DCQCN Table -- Adding qdepth:[{}, {}] -> probability:{:.2f}% ({}/{})".format(last_range, last_range + QDEPTH_STEPSIZE - 1, float(i/SEED_RANGE_MAX)*100, i, SEED_RANGE_MAX))
    dcqcn_get_ecn_probability.add_with_dcqcn_mark_probability(deq_qdepth_start=last_range, deq_qdepth_end=last_range + QDEPTH_STEPSIZE - 1, value=i)
    last_range += QDEPTH_STEPSIZE
# > K_MAX
print("DCQCN Table -- Adding qdepth:[{}, {}] -> probability:{:.2f}%".format(last_range, QDEPTH_RANGE_MAX - 1, float(SEED_RANGE_MAX/SEED_RANGE_MAX)*100))
dcqcn_get_ecn_probability.add_with_dcqcn_mark_probability(deq_qdepth_start=last_range, deq_qdepth_end=QDEPTH_RANGE_MAX - 1, value=SEED_RANGE_MAX - 1)

####################
# COMPARISON TABLE #
####################
dcqcn_compare_probability = bfrt.dcqcn_buffering_test.pipe.SwitchEgress.dcqcn_compare_probability
# Less than 100%: one entry per (prob_output, random_number) with
# random_number < prob_output. SEED_K_MAX <= SEED_RANGE_MAX for the constants
# above, so range(prob_output) enumerates exactly those random numbers.
for prob_output in range(1, SEED_K_MAX):
    for random_number in range(prob_output):
        print("Comparison Table -- ECN Marking for Random Number {}, Output Value {}".format(random_number, prob_output))
        dcqcn_compare_probability.add_with_dcqcn_check_ecn_marking(dcqcn_prob_output=prob_output, dcqcn_random_number=random_number)
# 100% ECN Marking
prob_output = SEED_RANGE_MAX - 1
for random_number in range(SEED_RANGE_MAX):
    print("Comparison Table -- ECN Marking for Random Number {} < Output Value {}".format(random_number, prob_output))
    dcqcn_compare_probability.add_with_dcqcn_check_ecn_marking(dcqcn_prob_output=prob_output, dcqcn_random_number=random_number)


# #######################
# ###  CONFIG PKTGEN  ###
# #######################
# print("######## CONFIGURING PKTGEN ########")
# os.system("$SDE/run_pd_rpc.py {}".format(config_pktgen_script))
# time.sleep(0.5)
# print("PktGen configured for test traffic!")

###############################
### LOAD DEVTEST_CMDS FILE  ###
###############################
print("######## LOADING DEVTEST COMMANDS ########")
with open(devtest_cmds_file, "rb") as src_file:
    code = compile(src_file.read(), devtest_cmds_file, "exec")
# Executed in this script's namespace so the helper functions can see the
# register/table handles and DCQCN constants defined above.
exec(code)
print("devtest_cmds.py loaded!")
// IPv4 header without options (fields sum to 160 bits = 20 bytes).
// The 8-bit TOS byte is split into dscp (6 bits) and ecn (2 bits).
header ipv4_h {
    bit<4> version;
    bit<4> ihl;
    bit<6> dscp;
    bit<2> ecn;
    bit<16> total_len;
    bit<16> identification;
    bit<3> flags;
    bit<13> frag_offset;
    bit<8> ttl;
    bit<8> protocol;
    bit<16> hdr_checksum;
    ipv4_addr_t src_addr;
    ipv4_addr_t dst_addr;
}

// TCP header without options (160 bits = 20 bytes).
header tcp_h {
    bit<16> src_port;
    bit<16> dst_port;
    bit<32> seq_no;
    bit<32> ack_no;
    bit<4> data_offset;
    bit<4> res;
    bit<8> flags;
    bit<16> window;
    bit<16> checksum;
    bit<16> urgent_ptr;
}

// UDP header (64 bits = 8 bytes).
header udp_h {
    bit<16> src_port;
    bit<16> dst_port;
    bit<16> hdr_length;
    bit<16> checksum;
}

// ICMP header plus a 64-bit data_time field (128 bits = 16 bytes in total).
header icmp_h {
    bit<8> type_;
    bit<8> code;
    bit<16> hdr_checksum;
    bit<16> id;
    bit<16> seq_no;
    bit<64> data_time;
}

/*---- RDMA monitoring -----*/
header ib_bth_h { // 12 bytes
    /**
     * @brief opcode
     * --RC--
     * 0x04 RC RDMA SEND-ONLY (4)
     * 0x0A RC RDMA WRITE-ONLY (10)
     * 0x06 RC RDMA WRITE FIRST (6) - RETH
     * 0x07 RC RDMA WRITE MIDDLE (7)
     * 0x08 RC RDMA WRITE LAST (8)
     * 0x11 RC RDMA ACK/NACK (17) - AETH
     * 0x10 RC RDMA Read-response ONLY (16)
     * 0x0C RC RDMA Read-request (12)
     * 0x81 Mellanox's CNP packet (129)
     */
    bit<8> opcode;
    bit<8> flags; // 1 bit solicited event, 1 bit migreq, 2 bit padcount, 4 bit headerversion
    bit<16> partition_key;
    bit<8> reserved0;
    bit<24> destination_qp;
    bit<1> ack_request;
    bit<7> reserved1;
    bit<24> packet_seqnum;
}

// RC FIRST WR (128 bits = 16 bytes)
header ib_reth_h {
    bit<64> virtual_addr;
    bit<32> remote_key;
    bit<32> dma_length;
}

// RC SEND-ONLY (4) (64 bits = 8 bytes)
header ib_deth_h {
    bit<32> queue_key;
    bit<8> reserved2;
    bit<24> source_qp;
}

// ACK
// Only 16 bits are declared here.
// NOTE(review): the InfiniBand AETH is normally 32 bits (8-bit syndrome +
// 24-bit MSN); confirm that extracting just the first 16 bits is intended.
header ib_aeth_h {
    bit<1> reserved;
    bit<2> opcode; // (0: ACK, 3: NACK)
    bit<5> error_code; // (PSN SEQ ERROR)
    bit<8> msg_seq_number;
}

/* Any metadata to be bridged from ig to eg */
// Currently empty: no fields are bridged.
header bridged_meta_h {
}

/* Mirror Types */
const bit<3> IG_MIRROR_TYPE_1 = 1; // corresponds to ig_mirror1_h
const bit<3> EG_MIRROR_TYPE_1 = 2; // corresponds to eg_mirror1_h

// Metadata carried by an egress-mirrored copy.
header eg_mirror1_h {
    bit<48> egress_global_timestamp;
    bit<8> mirrored;
    @flexible bit<2> ecn;
}

// Metadata carried by an ingress-mirrored copy (104 bits = 13 bytes).
header ig_mirror1_h {
    bit<48> ingress_mac_timestamp;
    bit<8> opcode;
    bit<8> mirrored;
    bit<8> last_ack;
    bit<32> rdma_seqnum;
}

// All headers the parsers may extract and the deparsers emit.
struct header_t {
    /* custom bridged info, needs to be deparsed from ig to eg */
    bridged_meta_h bridged_meta;

    /* Normal headers */
    ethernet_h ethernet;
    ipv4_h ipv4;
    arp_h arp;
    tcp_h tcp;
    udp_h udp;
    icmp_h icmp;

    /* RDMA headers */
    ib_bth_h bth;
    ib_reth_h reth;
    ib_deth_h deth;
    ib_aeth_h aeth;
}

// Per-packet user metadata.
struct metadata_t {
    /* switch's ID for our virtual topology */
    port_metadata_t port_md;

    /* mirroring */
    eg_mirror1_h eg_mirror1;
    ig_mirror1_h ig_mirror1;
    MirrorId_t mirror_session;

    /* ECN */
    bit<1> exceeded_ecn_marking_threshold; // 1 -> packet gets the CE codepoint
    bit<8> cc_mode;                        // 5: DCTCP, 9: DCQCN
    bit<8> dcqcn_random_number;
    bit<8> dcqcn_prob_output;
}
// ---------------------------------------------------------------------------
// Ingress parser
//
// Extracts the ingress intrinsic metadata and the per-port metadata
// (switch_id), then Ethernet -> {IPv4, ARP} -> {TCP, UDP, ICMP}.
// UDP packets whose destination port is ROCE_V2 (4791) continue into the
// InfiniBand BTH and, depending on the BTH opcode, one extension header.
// ---------------------------------------------------------------------------
parser SwitchIngressParser(
        packet_in pkt,
        out header_t hdr,
        out metadata_t meta,
        out ingress_intrinsic_metadata_t ig_intr_md,
        out ingress_intrinsic_metadata_for_tm_t ig_intr_md_for_tm,
        out ingress_intrinsic_metadata_from_parser_t ig_intr_md_from_prsr){

    state start {
        pkt.extract(ig_intr_md);
        transition parse_port_metadata;
    }

    state parse_port_metadata {
        // The type argument tells the extern which struct to unpack the
        // port-metadata bytes into.
        meta.port_md = port_metadata_unpack<port_metadata_t>(pkt);
        transition init_metadata;
    }

    state init_metadata { // init bridged_meta (based on slide 23 of BA-1122)
        // hdr.bridged_meta.setValid();
        // hdr.bridged_meta.type = INTERNAL_HDR_TYPE_BRIDGED_META;
        // hdr.bridged_meta.info = 0;
        transition parse_ethernet;
    }

    state parse_ethernet {
        pkt.extract(hdr.ethernet);
        transition select(hdr.ethernet.ether_type){
            (bit<16>) ether_type_t.IPV4: parse_ipv4;
            (bit<16>) ether_type_t.ARP: parse_arp;
            default: accept;
        }
    }

    state parse_ipv4 {
        pkt.extract(hdr.ipv4);
        transition select(hdr.ipv4.protocol){
            (bit<8>) ipv4_proto_t.TCP: parse_tcp;
            (bit<8>) ipv4_proto_t.UDP: parse_udp;
            (bit<8>) ipv4_proto_t.ICMP: parse_icmp;
            default: accept;
        }
    }

    state parse_arp {
        pkt.extract(hdr.arp);
        transition accept;
    }

    state parse_tcp {
        pkt.extract(hdr.tcp);
        transition accept;
    }

    state parse_udp {
        pkt.extract(hdr.udp);
        transition select(hdr.udp.dst_port) {
            (bit<16>) udp_port_t.ROCE_V2: parse_bth;
            default: accept;
        }
    }

    state parse_bth {
        pkt.extract(hdr.bth);
        transition select(hdr.bth.opcode) {
            0x04 : parse_deth; // RC RDMA SEND-ONLY (4)
            0x06 : parse_reth; // RC RDMA WRITE FIRST (6)
            0x11 : parse_aeth; // RC RDMA ACK (17)
            default: accept;

            // 0x0A : parse_reth; // RC RDMA WRITE-ONLY (10) - RETH (not sure)
            // 0x2A : parse_reth; // UC RDMA Write (42) - RETH (not sure)
            // 0x64 : parse_deth; // UC RDMA SEND-ONLY - DETH (not sure)
        }
    }

    state parse_reth {
        pkt.extract(hdr.reth);
        transition accept;
    }

    state parse_deth {
        pkt.extract(hdr.deth);
        transition accept;
    }

    state parse_aeth {
        pkt.extract(hdr.aeth);
        transition accept;
    }

    state parse_icmp {
        pkt.extract(hdr.icmp);
        transition accept;
    }
}
// ---------------------------------------------------------------------------
// Egress parser
//
// Extracts the egress intrinsic metadata, optionally a mirror header
// (when exactly one of IG_MIRRORING_ENABLED / EG_MIRRORING_ENABLED is
// defined), and then the same Ethernet/IPv4/ARP/TCP/UDP/ICMP/RoCEv2 stack as
// the ingress parser.
// ---------------------------------------------------------------------------
parser SwitchEgressParser(
        packet_in pkt,
        out header_t hdr,
        out metadata_t meta,
        out egress_intrinsic_metadata_t eg_intr_md,
        out egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr){

    // internal_hdr_h internal_hdr;
    state start {
        pkt.extract(eg_intr_md);
        transition parse_metadata;
    }

    // Peek at the bytes that follow the intrinsic metadata, interpreted as the
    // enabled mirror header, and branch on its `mirrored` field: a mirrored
    // copy carries the mirror type there, anything else is parsed as Ethernet.
    state parse_metadata {
#if defined(IG_MIRRORING_ENABLED) && defined(EG_MIRRORING_ENABLED)
#error "Enable only one of IG_MIRRORING_ENABLED / EG_MIRRORING_ENABLED"
#elif defined(IG_MIRRORING_ENABLED)
        ig_mirror1_h mirror_md = pkt.lookahead<ig_mirror1_h>();
        transition select(mirror_md.mirrored) {
            (bit<8>)IG_MIRROR_TYPE_1 : parse_ig_mirror_md;
            default : parse_ethernet;
        }
#elif defined(EG_MIRRORING_ENABLED)
        eg_mirror1_h mirror_md = pkt.lookahead<eg_mirror1_h>();
        transition select(mirror_md.mirrored) {
            (bit<8>)EG_MIRROR_TYPE_1 : parse_eg_mirror_md;
            default : parse_ethernet;
        }
#else
        transition parse_ethernet; // if no ig/eg_mirroring
#endif
    }

    /* mirroring */
    state parse_ig_mirror_md {
        pkt.extract(meta.ig_mirror1);
        transition parse_ethernet;
    }

    state parse_eg_mirror_md {
        pkt.extract(meta.eg_mirror1);
        transition parse_ethernet;
    }

    state parse_ethernet {
        pkt.extract(hdr.ethernet);
        transition select(hdr.ethernet.ether_type){
            (bit<16>) ether_type_t.IPV4: parse_ipv4;
            (bit<16>) ether_type_t.ARP: parse_arp;
            default: accept;
        }
    }

    state parse_ipv4 {
        pkt.extract(hdr.ipv4);
        transition select(hdr.ipv4.protocol){
            (bit<8>) ipv4_proto_t.TCP: parse_tcp;
            (bit<8>) ipv4_proto_t.UDP: parse_udp;
            (bit<8>) ipv4_proto_t.ICMP: parse_icmp;
            default: accept;
        }
    }

    state parse_arp {
        pkt.extract(hdr.arp);
        transition accept;
    }

    state parse_tcp {
        pkt.extract(hdr.tcp);
        transition accept;
    }

    state parse_udp {
        pkt.extract(hdr.udp);
        transition select(hdr.udp.dst_port) {
            (bit<16>) udp_port_t.ROCE_V2: parse_bth;
            default: accept;
        }
    }

    state parse_bth {
        pkt.extract(hdr.bth);
        transition select(hdr.bth.opcode) {
            0x04 : parse_deth; // RC RDMA SEND-ONLY (4)
            0x06 : parse_reth; // RC RDMA WRITE FIRST (6)
            0x11 : parse_aeth; // RC RDMA ACK (17)
            default: accept;

            // 0x0A : parse_reth; // RC RDMA WRITE-ONLY (10) - RETH (not sure)
            // 0x2A : parse_reth; // UC RDMA Write (42) - RETH (not sure)
            // 0x64 : parse_deth; // UC RDMA SEND-ONLY - DETH (not sure)
        }
    }

    state parse_reth {
        pkt.extract(hdr.reth);
        transition accept;
    }

    state parse_deth {
        pkt.extract(hdr.deth);
        transition accept;
    }

    state parse_aeth {
        pkt.extract(hdr.aeth);
        transition accept;
    }

    state parse_icmp {
        pkt.extract(hdr.icmp);
        transition accept;
    }
    // do more stuff here if needed
}
eg_intr_md, 299 | in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr){ 300 | 301 | Mirror() mirror; 302 | Checksum() ipv4_checksum; 303 | 304 | apply{ 305 | 306 | if(eg_intr_md_for_dprsr.mirror_type == EG_MIRROR_TYPE_1){ 307 | mirror.emit(meta.mirror_session, 308 | { meta.eg_mirror1.egress_global_timestamp, 309 | meta.eg_mirror1.mirrored, 310 | meta.eg_mirror1.ecn 311 | }); 312 | } 313 | 314 | hdr.ipv4.hdr_checksum = ipv4_checksum.update({ 315 | hdr.ipv4.version, 316 | hdr.ipv4.ihl, 317 | hdr.ipv4.dscp, 318 | hdr.ipv4.ecn, 319 | hdr.ipv4.total_len, 320 | hdr.ipv4.identification, 321 | hdr.ipv4.flags, 322 | hdr.ipv4.frag_offset, 323 | hdr.ipv4.ttl, 324 | hdr.ipv4.protocol, 325 | hdr.ipv4.src_addr, 326 | hdr.ipv4.dst_addr}); 327 | 328 | pkt.emit(hdr); 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /native_dcqcn/p4src/native_dcqcn.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include 3 | #if __TARGET_TOFINO__ == 2 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | #include "includes/headers.p4" 10 | #include "includes/parser.p4" 11 | 12 | const int MCAST_GRP_ID = 1; // for ARP 13 | const bit<9> RECIRC_PORT_PIPE_1 = 196; // recirculation port 14 | const bit<32> OUT_OF_RANGE_24BIT = 32w16777216; // 2^24 15 | 16 | const bit<10> MIRROR_SESSION_RDMA_ID_IG = 10w777; 17 | const bit<10> MIRROR_SESSION_RDMA_ID_EG = 10w888; 18 | 19 | const int MAX_PORTS = 256; 20 | 21 | 22 | control SwitchIngress( 23 | inout header_t hdr, 24 | inout metadata_t meta, 25 | in ingress_intrinsic_metadata_t ig_intr_md, 26 | in ingress_intrinsic_metadata_from_parser_t ig_intr_md_from_prsr, 27 | inout ingress_intrinsic_metadata_for_deparser_t ig_intr_md_for_dprsr, 28 | inout ingress_intrinsic_metadata_for_tm_t ig_intr_md_for_tm){ 29 | 30 | /** 31 | * @brief L2 Forwarding 32 | */ 33 | action nop(){} 34 | action drop(){ 35 | ig_intr_md_for_dprsr.drop_ctl = 0b001; 
/*******************
 * Egress Pipeline *
 * *****************/

// Egress control: ECN marking for ECN-capable packets (static threshold for
// DCTCP, probabilistic RED-like marking for DCQCN) plus optional RDMA
// monitoring through ingress/egress mirroring.
control SwitchEgress(
        inout header_t hdr,
        inout metadata_t meta,
        in egress_intrinsic_metadata_t eg_intr_md,
        in egress_intrinsic_metadata_from_parser_t eg_intr_md_from_prsr,
        inout egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr,
        inout egress_intrinsic_metadata_for_output_port_t eg_intr_md_for_oport){

    // DCQCN (9)? DCTCP(5)?
    // Single-entry register holding the active congestion-control mode.
    Register<bit<8>,bit<1>>(1, 9) reg_cc_mode; // default: DCQCN (9)
    RegisterAction<bit<8>,bit<1>,bit<8>>(reg_cc_mode) get_reg_cc_mode = {
        void apply(inout bit<8> reg_val, out bit<8> rv){
            rv = reg_val;
        }
    };
    action get_cc_mode() {
        meta.cc_mode = get_reg_cc_mode.execute(0);
    }

    // DCTCP
    // Single-entry register holding the static marking threshold; the action
    // returns 1 when the dequeue queue depth is at or above it.
    Register<bit<32>,bit<1>>(1,1250) reg_ecn_marking_threshold; // default = 1250 (100KB)
    RegisterAction<bit<32>,bit<1>,bit<1>>(reg_ecn_marking_threshold) cmp_ecn_marking_threshold = {
        void apply(inout bit<32> reg_val, out bit<1> rv){
            if((bit<32>)eg_intr_md.deq_qdepth >= reg_val){
                rv = 1;
            }
            else {
                rv = 0;
            }
        }
    };

    action dctcp_check_ecn_marking(){
        meta.exceeded_ecn_marking_threshold = cmp_ecn_marking_threshold.execute(0);
    }

    action mark_ecn_ce_codepoint(){
        hdr.ipv4.ecn = 0b11;
    }

    // DCQCN
    action dcqcn_mark_probability(bit<8> value) {
        meta.dcqcn_prob_output = value;
    }

    // Maps the dequeue queue depth to an 8-bit marking value
    // (entries are installed by the control plane).
    table dcqcn_get_ecn_probability {
        key = {
            eg_intr_md.deq_qdepth : range; // 19 bits
        }
        actions = {
            dcqcn_mark_probability;
        }
        const default_action = dcqcn_mark_probability(0); // default: no ecn mark
        size = 1024;
    }

    Random<bit<8>>() random;  // random seed for sampling
    action dcqcn_get_random_number(){
        meta.dcqcn_random_number = random.get();
    }

    action nop(){}

    action dcqcn_check_ecn_marking() {
        meta.exceeded_ecn_marking_threshold = (bit<1>)1;
    }

    // A hit on (marking value, random number) means "mark this packet";
    // a miss leaves the flag untouched.
    table dcqcn_compare_probability {
        key = {
            meta.dcqcn_prob_output : exact;
            meta.dcqcn_random_number : exact;
        }
        actions = {
            dcqcn_check_ecn_marking;
            @defaultonly nop;
        }
        const default_action = nop();
        size = 65536;
    }

    // mirroring
    action encode_eg_mirror_md(bit<2> ecn_mark) {
        meta.eg_mirror1.egress_global_timestamp = eg_intr_md_from_prsr.global_tstamp;
        meta.eg_mirror1.mirrored = (bit<8>)EG_MIRROR_TYPE_1; // eg type
        meta.eg_mirror1.ecn = ecn_mark; // ecn mark
        meta.mirror_session = MIRROR_SESSION_RDMA_ID_EG; // session id
        eg_intr_md_for_dprsr.mirror_type = EG_MIRROR_TYPE_1; // for deparser
    }

    // On the mirrored copy: timestamp -> MAC src, PSN -> MAC dst, and the ECN
    // value recorded at mirroring time -> IPv4 ECN field.
    action decode_eg_mirror_md() {
        hdr.ethernet.src_addr = meta.eg_mirror1.egress_global_timestamp;
        hdr.ethernet.dst_addr = (bit<48>)hdr.bth.packet_seqnum;
        hdr.ipv4.ecn = meta.eg_mirror1.ecn;
    }

    // for debugging ECN marking
    Register<bit<32>,bit<1>>(1) reg_ecn_marking_cntr;
    RegisterAction<bit<32>,bit<1>,bit<1>>(reg_ecn_marking_cntr) incr_ecn_marking_cntr = {
        void apply(inout bit<32> reg_val, out bit<1> rv){
            reg_val = reg_val |+| 1; // saturating increment
        }
    };

    apply{
        /* IG_MIRRORING : RDMA Monitoring */
        #ifdef IG_MIRRORING_ENABLED
        if (meta.ig_mirror1.mirrored == (bit<8>)IG_MIRROR_TYPE_1) {
            /* Timestamp -> MAC Src Address*/
            hdr.ethernet.src_addr = meta.ig_mirror1.ingress_mac_timestamp; // 48 bits
            /* Sequence Number -> MAC Dst Address */
            hdr.ethernet.dst_addr = (bit<48>)meta.ig_mirror1.rdma_seqnum;
        }
        #endif

        /* ECN */
        if (hdr.ipv4.ecn == 0b01 || hdr.ipv4.ecn == 0b10){
            // The DCQCN path only ever sets this flag (a table miss is a nop),
            // so start from a defined 0 instead of relying on implicit
            // metadata initialization.
            meta.exceeded_ecn_marking_threshold = 0;
            get_cc_mode();
            if (meta.cc_mode == 5) {
                /* DCTCP (static marking) */
                dctcp_check_ecn_marking();
            } else if (meta.cc_mode == 9) {
                /* DCQCN (RED-like marking) */
                dcqcn_get_ecn_probability.apply(); // get probability to ecn-mark
                dcqcn_get_random_number(); // get random number for sampling
                dcqcn_compare_probability.apply();
            }
            if (meta.exceeded_ecn_marking_threshold == 1){
                mark_ecn_ce_codepoint();
                incr_ecn_marking_cntr.execute(0);
            }
        }

        /* EG_MIRRORING : RDMA_Monitoring */
        #ifdef EG_MIRRORING_ENABLED
        if (hdr.bth.isValid()) {
            if (meta.eg_mirror1.mirrored != (bit<8>)EG_MIRROR_TYPE_1) { // to be mirrored
                encode_eg_mirror_md(hdr.ipv4.ecn);
            } else { // mirrored
                decode_eg_mirror_md();
                // debugging of eg_mirror
                if (hdr.ipv4.ecn == 0b11) {
                    incr_ecn_marking_cntr.execute(0);
                }
            }
        }
        #endif
    } // end of apply block

} // End of SwitchEgress