├── .github
│   └── workflows
│       └── build.yml
├── .gitignore
├── Cargo.toml
├── Changes.md
├── LICENSE
├── Readme.md
├── examples
│   ├── flood-mt.rs
│   └── flood.rs
└── src
    ├── lib.rs
    ├── mmap.rs
    ├── xdp.rs
    ├── xsk.rs
    └── xsk
        ├── iface.rs
        ├── ring.rs
        ├── socket.rs
        ├── umem.rs
        └── user.rs

/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: CI workflow
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |       - 'master'
 7 |   workflow_dispatch:
 8 |   push:
 9 |     branches:
10 |       - 'master'
11 | 
12 | env:
13 |   CARGO_TERM_COLOR: always
14 | 
15 | jobs:
16 |   build:
17 |     name: Build
18 |     strategy:
19 |       fail-fast: true
20 |       matrix:
21 |         include:
22 |           - os: ubuntu-latest
23 |             rust-version: 'stable'
24 |             target: x86_64-unknown-linux-gnu
25 |     runs-on: ${{ matrix.os }}
26 |     steps:
27 |       - uses: actions/checkout@master
28 |       - name: Set up Rust
29 |         uses: dtolnay/rust-toolchain@master
30 |         with:
31 |           toolchain: ${{ matrix.rust-version }}
32 |           target: ${{ matrix.target }}
33 |       - uses: swatinem/rust-cache@v2
34 |         with:
35 |           key: ${{ matrix.target }}
36 |           save-if: ${{ github.ref == 'refs/heads/master' }}
37 |       - name: Build
38 |         run: |
39 |           cargo build --release --target=${{ matrix.target }}
40 |       - name: Clippy
41 |         run: |
42 |           cargo clippy --target=${{ matrix.target }}
43 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /Cargo.lock
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | # Distribution information.
 3 | name = "xdpilone"
 4 | version = "1.2.0"
 5 | 
 6 | # User facing information.
 7 | description = "Interaction with Linux XDP sockets and rings. No libbpf/libxdp-sys. Lightweight, high-performance."
 8 | license = "EUPL-1.2"
 9 | authors = ["Aurelia Molzer"]
10 | categories = ["network-programming", "os::linux-apis", "no-std"]
11 | repository = "https://github.com/HeroicKatora/xdpilone"
12 | readme = "Readme.md"
13 | 
14 | # Cargo & resolver configuration.
15 | edition = "2021"
16 | rust-version = "1.65.0"
17 | 
18 | [dependencies.libc]
19 | version = "0.2"
20 | default-features = false
21 | # For exclusion of multiple devices on the same queue.
22 | [dependencies.spin]
23 | version = "0.9"
24 | features = ["rwlock"]
25 | default-features = false
26 | 
27 | [dev-dependencies.clap]
28 | version = "4"
29 | features = ["derive"]
30 | 
31 | [badges]
32 | maintenance = { status = "passively-maintained" }
--------------------------------------------------------------------------------
/Changes.md:
--------------------------------------------------------------------------------
 1 | ## v1.2.0
 2 | 
 3 | - Introduced `XdpStatisticsV2`, a forward-compatible struct for fetching
 4 |   statistics related to an XDP socket. The kernel fills in the passed struct
 5 |   depending on the indicated size; older kernels will leave some of its members
 6 |   untouched. This struct is marked non-exhaustive so that any future fields
 7 |   added by the kernel can be supported in a minor change without requiring
 8 |   further separate types.
 9 | 
10 | ## v1.1.1
11 | 
12 | - The flags passed via `SocketFlags::bind_flags` are now applied to all bind
13 |   calls where previously they would only apply to sockets sharing the umem with
14 |   another prior socket.
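As an illustration of the size-based versioning described in the v1.2.0 entry
above, the fetch boils down to the following (a minimal sketch mirroring the
crate-internal code in `src/xsk/iface.rs`; `fd` stands for the raw XDP socket
descriptor, and `SOL_XDP`/`XDP_STATISTICS` for the crate-internal constants,
none of which are public API):

```rust
let mut stats = XdpStatisticsV2::default();
// Tell the kernel how large our struct is; it fills in at most that much.
let mut optlen = core::mem::size_of_val(&stats) as libc::socklen_t;
let err = unsafe {
    libc::getsockopt(
        fd,
        SOL_XDP,
        XDP_STATISTICS,
        (&mut stats) as *mut _ as *mut libc::c_void,
        &mut optlen,
    )
};
// An older kernel reports back a smaller `optlen`; the trailing,
// non-exhaustive fields then simply keep their `Default` values.
```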
15 | 
16 | ## v1.1
17 | 
18 | - Added `DeviceQueue::bind` for binding queues from multiple different
19 |   interfaces to the same underlying `umem`. Previously, only a single socket
20 |   for each additional queue could be bound when the same socket set up both
21 |   fill/completion rings as well as receive/transmit rings.
22 | - Note: I'm currently not entirely comfortable with the types of the `bind`
23 |   argument. They are not generic enough to cover all possible usages: the
24 |   fq/cq socket itself would be sufficient, but a `User` with rx/tx sockopts is
25 |   required. At the same time, however, the types barely guard invariants that
26 |   would detect some misuse or failure paths at compile time. Also, the
27 |   to-be-used bind flags are associated with the socket as a `User` struct, not
28 |   passed as an independent argument to the `bind` call.
29 | - Note: Please open PRs resolving this either way, not issues.
30 | - Renamed `Errno::new` to `Errno::last_os_error`, aligning it with the standard
31 |   library for this construct. The old name is kept as a doc-hidden
32 |   method for compatibility.
33 | 
34 | ## v1.0.5
35 | 
36 | - Discovered that `XdpUmemReg` contains padding, being passed to the kernel
37 |   as the `tx_metadata_len` option. This would show up as spurious invalid
38 |   argument (EINVAL) errors from the kernel's interpretation of the field.
39 | 
40 | ## v1.0.4
41 | 
42 | - No code changes.
43 | - Clarified status as feature-complete, passively maintained.
44 | - Updated some documentation.
45 | 
46 | 
47 | ## v1.0.3
48 | 
49 | - Hide an unimplemented function sketch which was accidentally left over from
50 |   previous experiments. Calling it always panics. The method will remain
51 |   accessible for compatibility reasons (SemVer).
52 | 
53 | ## v1.0.2
54 | 
55 | - Implement `Iterator` for `ReadRx` and `ReadComplete`.
56 | - Document queue interaction adapters with intended workflow.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | EUROPEAN UNION PUBLIC LICENCE v. 1.2
 2 | EUPL © the European Union 2007, 2016
 3 | 
 4 | This European Union Public Licence (the ‘EUPL’) applies to the Work (as defined
 5 | below) which is provided under the terms of this Licence. Any use of the Work,
 6 | other than as authorised under this Licence is prohibited (to the extent such
 7 | use is covered by a right of the copyright holder of the Work).
 8 | 
 9 | The Work is provided under the terms of this Licence when the Licensor (as
10 | defined below) has placed the following notice immediately following the
11 | copyright notice for the Work:
12 | 
13 |         Licensed under the EUPL
14 | 
15 | or has expressed by any other means his willingness to license under the EUPL.
16 | 
17 | 1. Definitions
18 | 
19 | In this Licence, the following terms have the following meaning:
20 | 
21 | - ‘The Licence’: this Licence.
22 | 
23 | - ‘The Original Work’: the work or software distributed or communicated by the
24 |   Licensor under this Licence, available as Source Code and also as Executable
25 |   Code as the case may be.
26 | 
27 | - ‘Derivative Works’: the works or software that could be created by the
28 |   Licensee, based upon the Original Work or modifications thereof.
This Licence 29 | does not define the extent of modification or dependence on the Original Work 30 | required in order to classify a work as a Derivative Work; this extent is 31 | determined by copyright law applicable in the country mentioned in Article 15. 32 | 33 | - ‘The Work’: the Original Work or its Derivative Works. 34 | 35 | - ‘The Source Code’: the human-readable form of the Work which is the most 36 | convenient for people to study and modify. 37 | 38 | - ‘The Executable Code’: any code which has generally been compiled and which is 39 | meant to be interpreted by a computer as a program. 40 | 41 | - ‘The Licensor’: the natural or legal person that distributes or communicates 42 | the Work under the Licence. 43 | 44 | - ‘Contributor(s)’: any natural or legal person who modifies the Work under the 45 | Licence, or otherwise contributes to the creation of a Derivative Work. 46 | 47 | - ‘The Licensee’ or ‘You’: any natural or legal person who makes any usage of 48 | the Work under the terms of the Licence. 49 | 50 | - ‘Distribution’ or ‘Communication’: any act of selling, giving, lending, 51 | renting, distributing, communicating, transmitting, or otherwise making 52 | available, online or offline, copies of the Work or providing access to its 53 | essential functionalities at the disposal of any other natural or legal 54 | person. 55 | 56 | 2. Scope of the rights granted by the Licence 57 | 58 | The Licensor hereby grants You a worldwide, royalty-free, non-exclusive, 59 | sublicensable licence to do the following, for the duration of copyright vested 60 | in the Original Work: 61 | 62 | - use the Work in any circumstance and for all usage, 63 | - reproduce the Work, 64 | - modify the Work, and make Derivative Works based upon the Work, 65 | - communicate to the public, including the right to make available or display 66 | the Work or copies thereof to the public and perform publicly, as the case may 67 | be, the Work, 68 | - distribute the Work or copies thereof, 69 | - lend and rent the Work or copies thereof, 70 | - sublicense rights in the Work or copies thereof. 71 | 72 | Those rights can be exercised on any media, supports and formats, whether now 73 | known or later invented, as far as the applicable law permits so. 74 | 75 | In the countries where moral rights apply, the Licensor waives his right to 76 | exercise his moral right to the extent allowed by law in order to make effective 77 | the licence of the economic rights here above listed. 78 | 79 | The Licensor grants to the Licensee royalty-free, non-exclusive usage rights to 80 | any patents held by the Licensor, to the extent necessary to make use of the 81 | rights granted on the Work under this Licence. 82 | 83 | 3. Communication of the Source Code 84 | 85 | The Licensor may provide the Work either in its Source Code form, or as 86 | Executable Code. If the Work is provided as Executable Code, the Licensor 87 | provides in addition a machine-readable copy of the Source Code of the Work 88 | along with each copy of the Work that the Licensor distributes or indicates, in 89 | a notice following the copyright notice attached to the Work, a repository where 90 | the Source Code is easily and freely accessible for as long as the Licensor 91 | continues to distribute or communicate the Work. 92 | 93 | 4. 
Limitations on copyright 94 | 95 | Nothing in this Licence is intended to deprive the Licensee of the benefits from 96 | any exception or limitation to the exclusive rights of the rights owners in the 97 | Work, of the exhaustion of those rights or of other applicable limitations 98 | thereto. 99 | 100 | 5. Obligations of the Licensee 101 | 102 | The grant of the rights mentioned above is subject to some restrictions and 103 | obligations imposed on the Licensee. Those obligations are the following: 104 | 105 | Attribution right: The Licensee shall keep intact all copyright, patent or 106 | trademarks notices and all notices that refer to the Licence and to the 107 | disclaimer of warranties. The Licensee must include a copy of such notices and a 108 | copy of the Licence with every copy of the Work he/she distributes or 109 | communicates. The Licensee must cause any Derivative Work to carry prominent 110 | notices stating that the Work has been modified and the date of modification. 111 | 112 | Copyleft clause: If the Licensee distributes or communicates copies of the 113 | Original Works or Derivative Works, this Distribution or Communication will be 114 | done under the terms of this Licence or of a later version of this Licence 115 | unless the Original Work is expressly distributed only under this version of the 116 | Licence — for example by communicating ‘EUPL v. 1.2 only’. The Licensee 117 | (becoming Licensor) cannot offer or impose any additional terms or conditions on 118 | the Work or Derivative Work that alter or restrict the terms of the Licence. 119 | 120 | Compatibility clause: If the Licensee Distributes or Communicates Derivative 121 | Works or copies thereof based upon both the Work and another work licensed under 122 | a Compatible Licence, this Distribution or Communication can be done under the 123 | terms of this Compatible Licence. For the sake of this clause, ‘Compatible 124 | Licence’ refers to the licences listed in the appendix attached to this Licence. 125 | Should the Licensee's obligations under the Compatible Licence conflict with 126 | his/her obligations under this Licence, the obligations of the Compatible 127 | Licence shall prevail. 128 | 129 | Provision of Source Code: When distributing or communicating copies of the Work, 130 | the Licensee will provide a machine-readable copy of the Source Code or indicate 131 | a repository where this Source will be easily and freely available for as long 132 | as the Licensee continues to distribute or communicate the Work. 133 | 134 | Legal Protection: This Licence does not grant permission to use the trade names, 135 | trademarks, service marks, or names of the Licensor, except as required for 136 | reasonable and customary use in describing the origin of the Work and 137 | reproducing the content of the copyright notice. 138 | 139 | 6. Chain of Authorship 140 | 141 | The original Licensor warrants that the copyright in the Original Work granted 142 | hereunder is owned by him/her or licensed to him/her and that he/she has the 143 | power and authority to grant the Licence. 144 | 145 | Each Contributor warrants that the copyright in the modifications he/she brings 146 | to the Work are owned by him/her or licensed to him/her and that he/she has the 147 | power and authority to grant the Licence. 148 | 149 | Each time You accept the Licence, the original Licensor and subsequent 150 | Contributors grant You a licence to their contributions to the Work, under the 151 | terms of this Licence. 152 | 153 | 7. 
Disclaimer of Warranty 154 | 155 | The Work is a work in progress, which is continuously improved by numerous 156 | Contributors. It is not a finished work and may therefore contain defects or 157 | ‘bugs’ inherent to this type of development. 158 | 159 | For the above reason, the Work is provided under the Licence on an ‘as is’ basis 160 | and without warranties of any kind concerning the Work, including without 161 | limitation merchantability, fitness for a particular purpose, absence of defects 162 | or errors, accuracy, non-infringement of intellectual property rights other than 163 | copyright as stated in Article 6 of this Licence. 164 | 165 | This disclaimer of warranty is an essential part of the Licence and a condition 166 | for the grant of any rights to the Work. 167 | 168 | 8. Disclaimer of Liability 169 | 170 | Except in the cases of wilful misconduct or damages directly caused to natural 171 | persons, the Licensor will in no event be liable for any direct or indirect, 172 | material or moral, damages of any kind, arising out of the Licence or of the use 173 | of the Work, including without limitation, damages for loss of goodwill, work 174 | stoppage, computer failure or malfunction, loss of data or any commercial 175 | damage, even if the Licensor has been advised of the possibility of such damage. 176 | However, the Licensor will be liable under statutory product liability laws as 177 | far such laws apply to the Work. 178 | 179 | 9. Additional agreements 180 | 181 | While distributing the Work, You may choose to conclude an additional agreement, 182 | defining obligations or services consistent with this Licence. However, if 183 | accepting obligations, You may act only on your own behalf and on your sole 184 | responsibility, not on behalf of the original Licensor or any other Contributor, 185 | and only if You agree to indemnify, defend, and hold each Contributor harmless 186 | for any liability incurred by, or claims asserted against such Contributor by 187 | the fact You have accepted any warranty or additional liability. 188 | 189 | 10. Acceptance of the Licence 190 | 191 | The provisions of this Licence can be accepted by clicking on an icon ‘I agree’ 192 | placed under the bottom of a window displaying the text of this Licence or by 193 | affirming consent in any other similar way, in accordance with the rules of 194 | applicable law. Clicking on that icon indicates your clear and irrevocable 195 | acceptance of this Licence and all of its terms and conditions. 196 | 197 | Similarly, you irrevocably accept this Licence and all of its terms and 198 | conditions by exercising any rights granted to You by Article 2 of this Licence, 199 | such as the use of the Work, the creation by You of a Derivative Work or the 200 | Distribution or Communication by You of the Work or copies thereof. 201 | 202 | 11. Information to the public 203 | 204 | In case of any Distribution or Communication of the Work by means of electronic 205 | communication by You (for example, by offering to download the Work from a 206 | remote location) the distribution channel or media (for example, a website) must 207 | at least provide to the public the information requested by the applicable law 208 | regarding the Licensor, the Licence and the way it may be accessible, concluded, 209 | stored and reproduced by the Licensee. 210 | 211 | 12. 
Termination of the Licence 212 | 213 | The Licence and the rights granted hereunder will terminate automatically upon 214 | any breach by the Licensee of the terms of the Licence. 215 | 216 | Such a termination will not terminate the licences of any person who has 217 | received the Work from the Licensee under the Licence, provided such persons 218 | remain in full compliance with the Licence. 219 | 220 | 13. Miscellaneous 221 | 222 | Without prejudice of Article 9 above, the Licence represents the complete 223 | agreement between the Parties as to the Work. 224 | 225 | If any provision of the Licence is invalid or unenforceable under applicable 226 | law, this will not affect the validity or enforceability of the Licence as a 227 | whole. Such provision will be construed or reformed so as necessary to make it 228 | valid and enforceable. 229 | 230 | The European Commission may publish other linguistic versions or new versions of 231 | this Licence or updated versions of the Appendix, so far this is required and 232 | reasonable, without reducing the scope of the rights granted by the Licence. New 233 | versions of the Licence will be published with a unique version number. 234 | 235 | All linguistic versions of this Licence, approved by the European Commission, 236 | have identical value. Parties can take advantage of the linguistic version of 237 | their choice. 238 | 239 | 14. Jurisdiction 240 | 241 | Without prejudice to specific agreement between parties, 242 | 243 | - any litigation resulting from the interpretation of this License, arising 244 | between the European Union institutions, bodies, offices or agencies, as a 245 | Licensor, and any Licensee, will be subject to the jurisdiction of the Court 246 | of Justice of the European Union, as laid down in article 272 of the Treaty on 247 | the Functioning of the European Union, 248 | 249 | - any litigation arising between other parties and resulting from the 250 | interpretation of this License, will be subject to the exclusive jurisdiction 251 | of the competent court where the Licensor resides or conducts its primary 252 | business. 253 | 254 | 15. Applicable Law 255 | 256 | Without prejudice to specific agreement between parties, 257 | 258 | - this Licence shall be governed by the law of the European Union Member State 259 | where the Licensor has his seat, resides or has his registered office, 260 | 261 | - this licence shall be governed by Belgian law if the Licensor has no seat, 262 | residence or registered office inside a European Union Member State. 263 | 264 | Appendix 265 | 266 | ‘Compatible Licences’ according to Article 5 EUPL are: 267 | 268 | - GNU General Public License (GPL) v. 2, v. 3 269 | - GNU Affero General Public License (AGPL) v. 3 270 | - Open Software License (OSL) v. 2.1, v. 3.0 271 | - Eclipse Public License (EPL) v. 1.0 272 | - CeCILL v. 2.0, v. 2.1 273 | - Mozilla Public Licence (MPL) v. 2 274 | - GNU Lesser General Public Licence (LGPL) v. 2.1, v. 3 275 | - Creative Commons Attribution-ShareAlike v. 3.0 Unported (CC BY-SA 3.0) for 276 | works other than software 277 | - European Union Public Licence (EUPL) v. 1.1, v. 1.2 278 | - Québec Free and Open-Source Licence — Reciprocity (LiLiQ-R) or Strong 279 | Reciprocity (LiLiQ-R+). 
280 | 
281 | The European Commission may update this Appendix to later versions of the above
282 | licences without producing a new version of the EUPL, as long as they provide
283 | the rights granted in Article 2 of this Licence and protect the covered Source
284 | Code from exclusive appropriation.
285 | 
286 | All other changes or additions to this Appendix require the production of a new
287 | EUPL version.
288 | 
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | The XDP Rust access library.
 2 | 
 3 | ## Motivation
 4 | 
 5 | For Linux AF_XDP, all existing libraries are based on or around the C access
 6 | libraries. The goal is to develop a Rust-centric library that can take
 7 | advantage of Rust's thread-safety benefits for socket types, as well as
 8 | high-level abstractions (such as closures and `Arc`) for interacting with the
 9 | packet buffers.
10 | 
11 | The primary metrics for decision making are performance and latency.
12 | 
13 | ## Overview
14 | 
15 | Goals:
16 | - No more latency than the C implementation in the data paths.
17 | - Enable and simplify *correct* multi-threading on the same Umem.
18 | 
19 | Non-Goals:
20 | - Handling BPF / XSK_MAP. This is _necessary_ to accept packets on any of the
21 |   RX sockets created; however, it can be set up at any point with no
22 |   interaction with the actual queues. Hence we keep this large dependency
23 |   tree separate. (You could choose a pure-Rust libbpf alternative if you want
24 |   to.)
25 | - Dealing with any aspects of buffer allocation.
26 | 
27 | ## Name Origin
28 | 
29 | The drug Ixabepilone is a pharmaceutical against cancer.
--------------------------------------------------------------------------------
/examples/flood-mt.rs:
--------------------------------------------------------------------------------
 1 | //! This example demonstrates _flooding_ a network with packets.
 2 | //!
 3 | //! Aim at a network interface with care!
 4 | use core::cell::UnsafeCell;
 5 | use core::sync::atomic::{AtomicU32, Ordering};
 6 | use core::{num::NonZeroU32, ptr::NonNull};
 7 | 
 8 | use xdpilone::xdp::XdpDesc;
 9 | use xdpilone::{BufIdx, DeviceQueue, IfInfo, RingTx, Socket, SocketConfig, Umem, UmemConfig};
10 | 
11 | // We can use _any_ data mapping, so let's use a static one set up by the linker/loader.
12 | #[repr(align(4096))]
13 | struct PacketMap(UnsafeCell<[u8; 1 << 20]>);
14 | // Safety: no instance used for unsynchronized data access.
15 | unsafe impl Sync for PacketMap {}
16 | 
17 | static MEM: PacketMap = PacketMap(UnsafeCell::new([0; 1 << 20]));
18 | 
19 | fn main() {
20 |     let args = <Args as clap::Parser>::parse();
21 | 
22 |     // Register the packet buffer with the kernel, getting an XDP socket file descriptor for it.
23 |     let mem = NonNull::new(MEM.0.get() as *mut [u8]).unwrap();
24 | 
25 |     // Safety: we guarantee this mapping is aligned, and will be alive. It is static, after all.
26 |     let umem = unsafe { Umem::new(UmemConfig::default(), mem) }.unwrap();
27 |     let info = ifinfo(&args).unwrap();
28 | 
29 |     // Let's use that same file descriptor for our packet buffer operations on the specified
30 |     // network interface. Umem + Fill/Complete + Rx/Tx will live on the same FD.
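    // In this multi-threaded variant only the first socket shares the Umem's file
    // descriptor (`Socket::with_shared`); every additional queue gets its own socket
    // (`Socket::new`) and is attached via `DeviceQueue::bind` in the loop further below
    // (see also the v1.1 notes in Changes.md).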
 31 | 
 32 |     let rxtx_config = SocketConfig {
 33 |         rx_size: None,
 34 |         tx_size: NonZeroU32::new(1 << 12),
 35 |         bind_flags: 0,
 36 |     };
 37 | 
 38 |     let num_threads = args.threads.map_or(1, NonZeroU32::get);
 39 | 
 40 |     let mut tx_queues = vec![];
 41 |     let mut to_binds = vec![];
 42 |     let mut devices = vec![];
 43 |     let mut socks = vec![];
 44 | 
 45 |     for (idx, (dev_idx, info)) in core::iter::repeat(info.iter().enumerate())
 46 |         .flatten()
 47 |         .take(num_threads as usize)
 48 |         .enumerate()
 49 |     {
 50 |         let sock = if idx == 0 {
 51 |             Socket::with_shared(&info, &umem).unwrap()
 52 |         } else {
 53 |             Socket::new(&info).unwrap()
 54 |         };
 55 | 
 56 |         if idx == dev_idx {
 57 |             devices.push(umem.fq_cq(&sock).unwrap());
 58 |         }
 59 | 
 60 |         // Configure our receive/transmit queues.
 61 |         let rxtx = umem.rx_tx(&sock, &rxtx_config).unwrap();
 62 |         socks.push(sock);
 63 |         to_binds.push(rxtx);
 64 |     }
 65 | 
 66 |     for (idx, ((dev_idx, queue), rxtx)) in core::iter::repeat(devices.iter().enumerate())
 67 |         .flatten()
 68 |         .zip(to_binds.iter())
 69 |         .enumerate()
 70 |     {
 71 |         if idx == dev_idx {
 72 |             eprintln!("Binding socket {idx} {}", rxtx.as_raw_fd());
 73 |             // Ready to bind, i.e. kernel to start doing things on the ring.
 74 |             umem.bind(&rxtx).unwrap();
 75 |         } else {
 76 |             queue.bind(&rxtx).unwrap();
 77 |         }
 78 |     }
 79 | 
 80 |     for (idx, rxtx) in to_binds.iter().enumerate() {
 81 |         eprintln!("Mapping socket {idx}");
 82 |         // Map the TX queue into our memory space.
 83 |         let tx = rxtx.map_tx().unwrap();
 84 |         assert!(rxtx.map_rx().is_err(), "did not provide a rx_size");
 85 |         tx_queues.push(tx);
 86 |     }
 87 | 
 88 |     // Setup one frame we're going to use, repeatedly.
 89 |     // We only need its descriptor for the TX queue.
 90 |     let desc = {
 91 |         let mut frame = umem.frame(BufIdx(0)).unwrap();
 92 |         // Safety: we are the unique thread accessing this at the moment.
 93 |         prepare_buffer(frame.offset, unsafe { frame.addr.as_mut() }, &args)
 94 |     };
 95 | 
 96 |     eprintln!("Connection up!");
 97 | 
 98 |     // Bring our bindings into an 'active duty' state.
 99 |     let start = std::time::Instant::now();
100 | 
101 |     let batch: u32 = args.batch.unwrap_or(1 << 10);
102 |     let total: u32 = args.total.unwrap_or(1 << 20);
103 |     const WAKE_THRESHOLD: u32 = 1 << 4;
104 | 
105 |     let sent_reserved = AtomicU32::new(0);
106 |     let sent = AtomicU32::new(0);
107 |     let completed = AtomicU32::new(0);
108 |     let stall_count = AtomicU32::new(0);
109 | 
110 |     let stat_loops = AtomicU32::new(0);
111 |     let stat_stall = AtomicU32::new(0);
112 |     let stat_woken = AtomicU32::new(0);
113 |     let tx_log_batch = [0; 33].map(AtomicU32::new);
114 |     let cq_log_batch = [0; 33].map(AtomicU32::new);
115 | 
116 |     let tx_by_sock: Vec<_> = (0..to_binds.len()).map(|_| AtomicU32::new(0)).collect();
117 |     let cq_by_queue: Vec<_> = (0..devices.len()).map(|_| AtomicU32::new(0)).collect();
118 | 
119 |     eprintln!(
120 |         "Dumping {} B with {} packets!",
121 |         total as f32 * desc.len as f32,
122 |         total
123 |     );
124 | 
125 |     let completer = |mut queue: DeviceQueue, ctr: &AtomicU32| loop {
126 |         let current = completed.load(Ordering::Relaxed);
127 | 
128 |         if current == total {
129 |             break;
130 |         }
131 | 
132 |         // Number of completions reaped in this iteration.
133 |         let comp_now: u32;
134 |         let comp_batch = sent
135 |             .load(Ordering::Acquire)
136 |             .saturating_sub(current)
137 |             .min(batch);
138 | 
139 |         {
140 |             // Try to dequeue some completions.
141 |             let mut reader = queue.complete(comp_batch);
142 |             let mut comp_temp = 0;
143 | 
144 |             while reader.read().is_some() {
145 |                 comp_temp += 1;
146 |             }
147 | 
148 |             comp_now = comp_temp;
149 |             reader.release();
150 |         }
151 | 
152 |         if comp_now == 0 {
153 |             stall_count.fetch_add(1, Ordering::Relaxed);
154 |             stat_stall.fetch_add(1, Ordering::Relaxed);
155 |         }
156 | 
157 |         completed.fetch_add(comp_now, Ordering::Release);
158 |         ctr.fetch_add(comp_now, Ordering::Release);
159 |         stat_loops.fetch_add(1, Ordering::Relaxed);
160 | 
161 |         cq_log_batch[32 - comp_now.leading_zeros() as usize].fetch_add(1, Ordering::Relaxed);
162 |     };
163 | 
164 |     let sender = |mut tx: RingTx, ctr: &AtomicU32| {
165 |         let mut stall_threshold = WAKE_THRESHOLD;
166 |         loop {
167 |             if sent.load(Ordering::Relaxed) >= total && completed.load(Ordering::Relaxed) >= total {
168 |                 break;
169 |             }
170 | 
171 |             let send_batch = loop {
172 |                 // Reserve some of these buffers. Relaxed loads because we don't synchronize with any
173 |                 // other memory locations, only atomicity.
174 |                 match sent_reserved.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| {
175 |                     Some(n + total.saturating_sub(n).min(batch))
176 |                 }) {
177 |                     // Break with the number of buffers newly reserved for us.
178 |                     Ok(prev) => break total.saturating_sub(prev).min(batch),
179 |                     Err(_) => continue,
180 |                 }
181 |             };
182 | 
183 |             let sent_now: u32;
184 | 
185 |             {
186 |                 // Try to add descriptors to the transmit buffer.
187 |                 let mut writer = tx.transmit(send_batch);
188 |                 let bufs = core::iter::repeat(desc);
189 |                 sent_now = writer.insert(bufs);
190 |                 writer.commit();
191 |             }
192 | 
193 |             if stall_count.load(Ordering::Relaxed) > stall_threshold {
194 |                 // It may be necessary to wake up. This is costly, in relative terms, so we avoid doing
195 |                 // it when the kernel proceeds without us. We detect this by checking if both queues
196 |                 // failed to make progress for some time.
197 |                 tx.wake();
198 |                 stat_woken.fetch_add(1, Ordering::Relaxed);
199 |                 stall_threshold += WAKE_THRESHOLD;
200 |             }
201 | 
202 |             // Stat tracking..
203 |             sent_reserved.fetch_sub(send_batch - sent_now, Ordering::Relaxed);
204 |             sent.fetch_add(sent_now, Ordering::Release);
205 |             ctr.fetch_add(sent_now, Ordering::Release);
206 | 
207 |             tx_log_batch[32 - sent_now.leading_zeros() as usize].fetch_add(1, Ordering::Relaxed);
208 |         }
209 |     };
210 | 
211 |     std::thread::scope(|scope| {
212 |         for (queue, ctr) in devices.into_iter().zip(cq_by_queue.iter()) {
213 |             scope.spawn(|| completer(queue, ctr));
214 |         }
215 | 
216 |         for (tx, ctr) in tx_queues.into_iter().zip(tx_by_sock.iter()) {
217 |             scope.spawn(|| sender(tx, ctr));
218 |         }
219 |     });
220 | 
221 |     // Dump all measurements we took.
222 |     let end = std::time::Instant::now();
223 |     let secs = end.saturating_duration_since(start).as_secs_f32();
224 | 
225 |     let packets = completed.into_inner() as f32;
226 |     let bytes = packets * desc.len as f32;
227 | 
228 |     eprintln!(
229 |         "{:?} s; {} pkt; {} pkt/s; {} B/s; {} L1-B/s",
230 |         secs,
231 |         packets,
232 |         packets / secs,
233 |         bytes / secs,
234 |         // Each frame has 7 (preamble) + 1 (delimiter) + 12 (IPG) bytes of Ethernet overhead.
235 |         (bytes + packets * 20.) / secs,
236 |     );
237 | 
238 |     eprintln!(
239 |         "Statistics\nLoops: {}; stalled: {}; wake/sys-call: {}",
240 |         stat_loops.into_inner(),
241 |         stat_stall.into_inner(),
242 |         stat_woken.into_inner()
243 |     );
244 | 
245 |     eprintln!("Tx Batch size (log2): {:?}", tx_log_batch);
246 |     eprintln!("Cq Batch size (log2): {:?}", cq_log_batch);
247 | 
248 |     eprintln!("Tx by socket: {:?}", tx_by_sock);
249 |     eprintln!("Cq by queue: {:?}", cq_by_queue);
250 | }
251 | 
252 | fn prepare_buffer(offset: u64, buffer: &mut [u8], args: &Args) -> XdpDesc {
253 |     buffer[..ARP.len()].copy_from_slice(&ARP[..]);
254 |     let extra = args.length.unwrap_or(0).saturating_sub(ARP.len() as u32);
255 | 
256 |     XdpDesc {
257 |         addr: offset,
258 |         len: ARP.len() as u32 + extra,
259 |         options: 0,
260 |     }
261 | }
262 | 
263 | #[derive(clap::Parser)]
264 | struct Args {
265 |     /// The name of the interface to use.
266 |     ifname: Vec<String>,
267 |     /// Overwrite the queue_id.
268 |     #[arg(long = "queue-id")]
269 |     queue_id: Option<u32>,
270 |     /// Maximum number of queue operations in a single loop.
271 |     #[arg(long = "batch-size")]
272 |     batch: Option<u32>,
273 |     /// The total number of packets to enqueue on the NIC.
274 |     #[arg(long = "packet-total")]
275 |     total: Option<u32>,
276 |     /// The count of bytes in each test packet to flood.
277 |     #[arg(long = "packet-length")]
278 |     length: Option<u32>,
279 |     #[arg(long = "threads")]
280 |     threads: Option<NonZeroU32>,
281 | }
282 | 
283 | fn ifinfo(args: &Args) -> Result<Vec<IfInfo>, xdpilone::Errno> {
284 |     let mut infos = vec![];
285 | 
286 |     if args.ifname.is_empty() {
287 |         eprintln!("At least one IFNAME required");
288 |         std::process::exit(1);
289 |     }
290 | 
291 |     for ifname in &args.ifname {
292 |         let mut bytes = ifname.to_owned();
293 |         bytes.push('\0');
294 |         let bytes = bytes.as_bytes();
295 |         let name = core::ffi::CStr::from_bytes_with_nul(bytes).unwrap();
296 | 
297 |         let mut info = IfInfo::invalid();
298 |         info.from_name(name)?;
299 |         if let Some(q) = args.queue_id {
300 |             info.set_queue(q);
301 |         }
302 | 
303 |         infos.push(info);
304 |     }
305 | 
306 |     Ok(infos)
307 | }
308 | 
309 | #[rustfmt::skip]
310 | static ARP: [u8; 14+28] = [
311 |     0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
312 |     0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
313 |     0x08, 0x06,
314 | 
315 |     0x00, 0x01,
316 |     0x08, 0x00, 0x06, 0x04,
317 |     0x00, 0x01,
318 |     0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
319 |     0x21, 0x22, 0x23, 0x24,
320 |     0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
321 |     0x41, 0x42, 0x43, 0x44,
322 | ];
323 | 
--------------------------------------------------------------------------------
/examples/flood.rs:
--------------------------------------------------------------------------------
 1 | //! This example demonstrates _flooding_ a network with packets.
 2 | //!
 3 | //! This will very aggressively poll the queues. Seriously, wasting time is an understatement. Do
 4 | //! not use in production and aim at a network interface with care!
 5 | use core::{mem::MaybeUninit, num::NonZeroU32, ptr::NonNull};
 6 | use xdpilone::xdp::XdpDesc;
 7 | use xdpilone::{BufIdx, IfInfo, Socket, SocketConfig, Umem, UmemConfig};
 8 | 
 9 | // We can use _any_ data mapping, so let's use a leaked heap allocation here.
10 | #[repr(align(4096))]
11 | struct PacketMap(MaybeUninit<[u8; 1 << 20]>);
12 | 
13 | fn main() {
14 |     let args = <Args as clap::Parser>::parse();
15 | 
16 |     let alloc = Box::new(PacketMap(MaybeUninit::uninit()));
17 |     // Register the packet buffer with the kernel, getting an XDP socket file descriptor for it.
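    // Leaking the box gives the buffer a `'static` lifetime, which is what lets us
    // uphold the safety contract of `Umem::new` below: the mapping stays aligned and alive.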
 18 |     let mem = NonNull::new(Box::leak(alloc).0.as_mut_ptr()).unwrap();
 19 | 
 20 |     // Safety: we guarantee this mapping is aligned, and will be alive. It is `'static`, after all.
 21 |     let umem = unsafe { Umem::new(UmemConfig::default(), mem) }.unwrap();
 22 |     let info = ifinfo(&args).unwrap();
 23 | 
 24 |     // Let's use that same file descriptor for our packet buffer operations on the specified
 25 |     // network interface. Umem + Fill/Complete + Rx/Tx will live on the same FD.
 26 |     let sock = Socket::with_shared(&info, &umem).unwrap();
 27 |     // Get the fill/completion device (which handles the 'device queue').
 28 |     let device = umem.fq_cq(&sock).unwrap();
 29 | 
 30 |     // Configure our receive/transmit queues.
 31 |     let rxtx = umem
 32 |         .rx_tx(
 33 |             &sock,
 34 |             &SocketConfig {
 35 |                 rx_size: None,
 36 |                 tx_size: NonZeroU32::new(1 << 14),
 37 |                 bind_flags: SocketConfig::XDP_BIND_NEED_WAKEUP,
 38 |             },
 39 |         )
 40 |         .unwrap();
 41 | 
 42 |     assert!(rxtx.map_rx().is_err(), "did not provide a rx_size");
 43 |     // Map the TX queue into our memory space.
 44 |     let tx = rxtx.map_tx().unwrap();
 45 | 
 46 |     // Ready to bind, i.e. for the kernel to start doing things on the ring.
 47 |     umem.bind(&rxtx).unwrap();
 48 | 
 49 |     // Setup one frame we're going to use, repeatedly.
 50 |     // We only need its descriptor for the TX queue.
 51 |     let desc = {
 52 |         let mut frame = umem.frame(BufIdx(1)).unwrap();
 53 |         // Safety: we are the unique thread accessing this at the moment.
 54 |         prepare_buffer(frame.offset, unsafe { frame.addr.as_mut() }, &args)
 55 |     };
 56 | 
 57 |     eprintln!("Connection up!");
 58 | 
 59 |     // Bring our bindings into an 'active duty' state.
 60 |     let mut tx = tx;
 61 |     let mut device = device;
 62 | 
 63 |     let start = std::time::Instant::now();
 64 | 
 65 |     let batch: u32 = args.batch.unwrap_or(1 << 10);
 66 |     let total: u32 = args.total.unwrap_or(1 << 20);
 67 | 
 68 |     let mut sent = 0;
 69 |     let mut completed = 0;
 70 | 
 71 |     // some nice stats to track and later report.
 72 |     let mut stat_loops = 0;
 73 |     let mut stat_stall = 0;
 74 |     let mut stat_woken = 0;
 75 |     let mut tx_log_batch = [0; 33];
 76 |     let mut cq_log_batch = [0; 33];
 77 | 
 78 |     eprintln!(
 79 |         "Dumping {} B with {} packets!",
 80 |         total as f32 * desc.len as f32,
 81 |         total
 82 |     );
 83 | 
 84 |     eprintln!("The description is {:?}", desc,);
 85 | 
 86 |     while !(sent == completed && sent == total) {
 87 |         let sent_now: u32; // Number of buffers enqueued in this iteration.
 88 |         let comp_now: u32; // Number of completions reaped in this iteration.
 89 | 
 90 |         {
 91 |             let send_batch = total.saturating_sub(sent).min(batch);
 92 |             // Try to add descriptors to the transmit buffer.
 93 |             let mut writer = tx.transmit(send_batch);
 94 |             let bufs = core::iter::repeat(desc);
 95 |             sent_now = writer.insert(bufs);
 96 |             writer.commit();
 97 |         }
 98 | 
 99 |         // It may be necessary to wake up. This is costly, in relative terms, so we avoid doing
100 |         // it when the kernel proceeds without us. We detect this by checking if both queues
101 |         // failed to make progress for some time. And then only do it once.
102 |         if tx.needs_wakeup() {
103 |             tx.wake();
104 |             stat_woken += 1;
105 |         }
106 | 
107 |         {
108 |             let comp_batch = sent.saturating_sub(completed).min(batch);
109 |             // Try to dequeue some completions.
110 |             let mut reader = device.complete(comp_batch);
111 |             let mut comp_temp = 0;
112 | 
113 |             while reader.read().is_some() {
114 |                 comp_temp += 1;
115 |             }
116 | 
117 |             comp_now = comp_temp;
118 |             reader.release();
119 |         }
120 | 
121 |         if sent_now == 0 && comp_now == 0 {
122 |             stat_stall += 1;
123 |         }
124 | 
125 |         // Stat tracking..
126 |         sent += sent_now;
127 |         completed += comp_now;
128 |         stat_loops += 1;
129 | 
130 |         tx_log_batch[32 - sent_now.leading_zeros() as usize] += 1;
131 |         cq_log_batch[32 - comp_now.leading_zeros() as usize] += 1;
132 |     }
133 | 
134 |     // Dump all measurements we took.
135 |     let end = std::time::Instant::now();
136 |     let secs = end.saturating_duration_since(start).as_secs_f32();
137 |     let packets = completed as f32;
138 |     let bytes = packets * desc.len as f32;
139 | 
140 |     eprintln!(
141 |         "{:?} s; {} pkt; {} pkt/s; {} B/s; {} L1-B/s",
142 |         secs,
143 |         packets,
144 |         packets / secs,
145 |         bytes / secs,
146 |         // Each frame has 7 (preamble) + 1 (delimiter) + 12 (IPG) bytes of Ethernet overhead.
147 |         (bytes + packets * 20.) / secs,
148 |     );
149 | 
150 |     eprintln!(
151 |         "Statistics\nLoops: {}; stalled: {}; wake/sys-call: {}",
152 |         stat_loops, stat_stall, stat_woken
153 |     );
154 | 
155 |     eprintln!("Tx Batch size (log2): {:?}", tx_log_batch);
156 |     eprintln!("Cq Batch size (log2): {:?}", cq_log_batch);
157 | }
158 | 
159 | fn prepare_buffer(offset: u64, buffer: &mut [u8], args: &Args) -> XdpDesc {
160 |     buffer[..ARP.len()].copy_from_slice(&ARP[..]);
161 |     let extra = args.length.unwrap_or(0).saturating_sub(ARP.len() as u32);
162 | 
163 |     XdpDesc {
164 |         addr: offset,
165 |         len: ARP.len() as u32 + extra,
166 |         options: 0,
167 |     }
168 | }
169 | 
170 | #[derive(clap::Parser)]
171 | struct Args {
172 |     /// The name of the interface to use.
173 |     ifname: String,
174 |     /// Overwrite the queue_id.
175 |     #[arg(long = "queue-id")]
176 |     queue_id: Option<u32>,
177 |     /// Maximum number of queue operations in a single loop.
178 |     #[arg(long = "batch-size")]
179 |     batch: Option<u32>,
180 |     /// The total number of packets to enqueue on the NIC.
181 |     #[arg(long = "packet-total")]
182 |     total: Option<u32>,
183 |     /// The count of bytes in each test packet to flood.
184 |     #[arg(long = "packet-length")]
185 |     length: Option<u32>,
186 | }
187 | 
188 | fn ifinfo(args: &Args) -> Result<IfInfo, xdpilone::Errno> {
189 |     let mut bytes = String::from(&args.ifname);
190 |     bytes.push('\0');
191 |     let bytes = bytes.as_bytes();
192 |     let name = core::ffi::CStr::from_bytes_with_nul(bytes).unwrap();
193 | 
194 |     let mut info = IfInfo::invalid();
195 |     info.from_name(name)?;
196 |     if let Some(q) = args.queue_id {
197 |         info.set_queue(q);
198 |     }
199 | 
200 |     Ok(info)
201 | }
202 | 
203 | #[rustfmt::skip]
204 | static ARP: [u8; 14+28] = [
205 |     0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
206 |     0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
207 |     0x08, 0x06,
208 | 
209 |     0x00, 0x01,
210 |     0x08, 0x00, 0x06, 0x04,
211 |     0x00, 0x01,
212 |     0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
213 |     0x21, 0x22, 0x23, 0x24,
214 |     0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
215 |     0x41, 0x42, 0x43, 0x44,
216 | ];
217 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! Rust idiomatic bindings for the AF_XDP socket interface.
 2 | //!
 3 | //! This library helps with creating suitable socket(s) from a memory allocation of chunks, sockets
 4 | //! for access to all four rings, binding to a specific `(ifname, queue_id)`, and with creating the
 5 | //! memory mapping to interact with all these queues directly.
 6 | //!
 7 | //! It does _not_ interact with the packet filter / forwarding directly, nor any other aspect of
 8 | //! `bpf`. You can send packets but you cannot receive them. Please use another library for `bpf`
 9 | //! and `netlink` interaction to configure the network device to route received frames to the RX
10 | //! ring.
11 | //!
12 | //! The entrypoint to the library is an instance of [`crate::Umem`].
13 | #![no_std]
14 | #![deny(missing_docs)]
15 | extern crate alloc;
16 | 
17 | /// User-space side of one or multiple XDP sockets.
18 | mod xsk;
19 | 
20 | pub use xsk::{
21 |     BufIdx, DeviceQueue, IfInfo, ReadComplete, ReadRx, RingCons, RingProd, RingRx, RingTx, Socket,
22 |     SocketConfig, Umem, UmemChunk, UmemConfig, User, WriteFill, WriteTx,
23 | };
24 | 
25 | /// Bindings for the XDP kernel interface, including structs.
26 | pub mod xdp;
27 | 
28 | pub(crate) struct LastErrno;
29 | 
30 | /// An error that has been read from `errno`.
31 | //
32 | // `Default` is a bit misleading even though there is a constructor without any parameters. In
33 | // hindsight it may have been better to provide a descriptive name.
34 | #[allow(clippy::new_without_default)]
35 | pub struct Errno(libc::c_int);
36 | 
37 | impl From<LastErrno> for Errno {
38 |     fn from(LastErrno: LastErrno) -> Self {
39 |         Errno::last_os_error()
40 |     }
41 | }
42 | 
43 | impl Errno {
44 |     /// Create an error from the latest `errno`.
45 |     #[deprecated = "use the more descriptive name `Errno::last_os_error`"]
46 |     #[doc(hidden)]
47 |     pub fn new() -> Self {
48 |         Self::last_os_error()
49 |     }
50 | 
51 |     /// Create an error from the latest `errno`.
52 |     pub fn last_os_error() -> Self {
53 |         Errno(unsafe { *libc::__errno_location() })
54 |     }
55 | 
56 |     /// Get the actual `errno` value.
57 |     pub fn get_raw(&self) -> libc::c_int {
58 |         self.0
59 |     }
60 | }
61 | 
62 | impl core::fmt::Display for Errno {
63 |     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
64 |         let st = unsafe { libc::strerror(self.0) };
65 |         let cstr = unsafe { core::ffi::CStr::from_ptr(st) };
66 |         write!(f, "{}", cstr.to_string_lossy())
67 |     }
68 | }
69 | 
70 | impl core::fmt::Debug for Errno {
71 |     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
72 |         write!(f, "Errno({}: {})", self.0, self)
73 |     }
74 | }
75 | 
--------------------------------------------------------------------------------
/src/mmap.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/197g/xdpilone/635387a62121ea09b5c3249769eb5440a38771ae/src/mmap.rs
--------------------------------------------------------------------------------
/src/xdp.rs:
--------------------------------------------------------------------------------
 1 | // Please see the respective Linux documentation instead.
 2 | #![allow(missing_docs)]
 3 | 
 4 | /// Rx/Tx descriptor.
 5 | ///
 6 | /// The layout of this struct is part of the kernel interface.
 7 | #[repr(C)]
 8 | #[derive(Default, Debug, Copy, Clone)]
 9 | pub struct XdpDesc {
10 |     /// Full address of this descriptor.
11 |     pub addr: u64,
12 |     /// Logical length of the buffer referenced by the descriptor.
13 |     pub len: u32,
14 |     /// A bitfield of options.
15 |     pub options: u32,
16 | }
17 | 
18 | /// Argument to `setsockopt(_, SOL_XDP, XDP_UMEM_REG)`.
19 | ///
20 | /// Note that this struct's size determines the kernel's interpretation of the option. In
21 | /// particular, padding passes garbage to the kernel while indicating said garbage as values!
22 | #[repr(C)]
23 | #[derive(Default, Debug, Copy, Clone)]
24 | pub struct XdpUmemReg {
25 |     pub addr: u64,
26 |     pub len: u64,
27 |     pub chunk_size: u32,
28 |     pub headroom: u32,
29 |     pub flags: u32,
30 |     pub tx_metadata_len: u32,
31 | }
32 | 
33 | const _NO_PADDING: () = {
34 |     assert!(
35 |         core::mem::size_of::<XdpUmemReg>()
36 |             // For each field. Keep in sync.
37 |             == (core::mem::size_of::<u64>()
38 |                 + core::mem::size_of::<u64>()
39 |                 + core::mem::size_of::<u32>()
40 |                 + core::mem::size_of::<u32>()
41 |                 + core::mem::size_of::<u32>()
42 |                 + core::mem::size_of::<u32>())
43 |     );
44 | };
45 | 
46 | /// The mmap-offsets to use for mapping one ring of an XDP socket.
47 | #[repr(C)]
48 | #[derive(Default, Debug, Copy, Clone)]
49 | pub struct XdpRingOffsets {
50 |     /// the relative address of the producer.
51 |     pub producer: u64,
52 |     /// the relative address of the consumer.
53 |     pub consumer: u64,
54 |     /// the relative address of the descriptor.
55 |     pub desc: u64,
56 |     /// the relative address of the flags area.
57 |     pub flags: u64,
58 | }
59 | 
60 | /// The different offsets as returned by the kernel, for all rings of a socket.
61 | #[repr(C)]
62 | #[derive(Default, Debug, Copy, Clone)]
63 | pub struct XdpMmapOffsets {
64 |     pub rx: XdpRingOffsets,
65 |     pub tx: XdpRingOffsets,
66 |     /// Fill ring offset.
67 |     pub fr: XdpRingOffsets,
68 |     /// Completion ring offset.
69 |     pub cr: XdpRingOffsets,
70 | }
71 | 
72 | /// Prior version of XdpRingOffsets (<= Linux 5.3).
73 | #[repr(C)]
74 | #[derive(Default, Debug, Copy, Clone)]
75 | pub struct XdpRingOffsetsV1 {
76 |     /// the relative address of the producer.
77 |     pub producer: u64,
78 |     /// the relative address of the consumer.
79 |     pub consumer: u64,
80 |     /// the relative address of the descriptor.
81 |     pub desc: u64,
82 | }
83 | 
84 | /// Prior version of XdpMmapOffsets (<= Linux 5.3).
85 | #[repr(C)]
86 | #[derive(Default, Debug, Copy, Clone)]
87 | pub struct XdpMmapOffsetsV1 {
88 |     /// Offsets for the receive ring (kernel produced).
89 |     pub rx: XdpRingOffsetsV1,
90 |     /// Offsets for the transmit ring (user produced).
91 |     pub tx: XdpRingOffsetsV1,
92 |     /// Offsets for the fill ring (user produced).
93 |     pub fr: XdpRingOffsetsV1,
94 |     /// Offsets for the completion ring (kernel produced).
 95 |     pub cr: XdpRingOffsetsV1,
 96 | }
 97 | 
 98 | #[repr(C)]
 99 | #[doc(alias = "sockaddr_xdp")]
100 | #[derive(Debug, Copy, Clone)]
101 | pub struct SockAddrXdp {
102 |     #[doc(alias = "sxdp_family")]
103 |     pub family: u16,
104 |     #[doc(alias = "sxdp_flags")]
105 |     pub flags: u16,
106 |     #[doc(alias = "sxdp_ifindex")]
107 |     pub ifindex: u32,
108 |     #[doc(alias = "sxdp_queue_id")]
109 |     pub queue_id: u32,
110 |     #[doc(alias = "sxdp_shared_umem_fd")]
111 |     pub shared_umem_fd: u32,
112 | }
113 | 
114 | /// Prior version of XdpStatisticsV2 that only contains the fields present on Linux <= 5.8.
115 | #[repr(C)]
116 | #[doc(alias = "xdp_statistics")]
117 | #[derive(Debug, Default, Copy, Clone)]
118 | pub struct XdpStatistics {
119 |     pub rx_dropped: u64,
120 |     pub rx_invalid_descs: u64,
121 |     pub tx_invalid_descs: u64,
122 | }
123 | 
124 | #[repr(C)]
125 | #[doc(alias = "xdp_statistics")]
126 | #[derive(Debug, Default, Copy, Clone)]
127 | #[non_exhaustive]
128 | pub struct XdpStatisticsV2 {
129 |     pub rx_dropped: u64,
130 |     pub rx_invalid_descs: u64,
131 |     pub tx_invalid_descs: u64,
132 |     // Only set on >= Linux 5.9
133 |     pub rx_ring_full: u64,
134 |     // Only set on >= Linux 5.9
135 |     pub rx_fill_ring_empty_descs: u64,
136 |     // Only set on >= Linux 5.9
137 |     pub tx_ring_empty_descs: u64,
138 | }
139 | 
140 | impl Default for SockAddrXdp {
141 |     fn default() -> Self {
142 |         SockAddrXdp {
143 |             family: libc::AF_XDP as u16,
144 |             flags: 0,
145 |             ifindex: 0,
146 |             queue_id: 0,
147 |             shared_umem_fd: 0,
148 |         }
149 |     }
150 | }
--------------------------------------------------------------------------------
/src/xsk.rs:
--------------------------------------------------------------------------------
 1 | //! Our own XSK (user-space XDP ring implementation).
 2 | //!
 3 | //! Consider: the reasoning behind these structs is their implementation in a _header_ of C code,
 4 | //! so that they can be optimized on all platforms. How much sense does it make to not write them
 5 | //! in Rust code, where rustc does _exactly_ this?
 6 | //!
 7 | //! The data structures here are not *safe* to construct. Some of them depend on the caller to
 8 | //! uphold guarantees such as keeping an mmap alive, or holding onto a socket for them. Take care.
 9 | 
10 | /// Implementations for interface related operations.
11 | mod iface;
12 | /// Implementations for primitives `XskRing`, `RingProd`, `RingCons`.
13 | mod ring;
14 | /// Implementations for sockets.
15 | mod socket;
16 | /// Implementation for memory management.
17 | mod umem;
18 | /// Implementations for the actual queue management (user-space side).
19 | mod user;
20 | 
21 | use crate::xdp::XdpMmapOffsets;
22 | 
23 | use alloc::sync::Arc;
24 | use core::sync::atomic::AtomicU32;
25 | use core::{num::NonZeroU32, ptr::NonNull};
26 | 
27 | pub(crate) struct SocketFd(libc::c_int);
28 | 
29 | /// Not defined in all libc versions and a _system_ property, not an implementation property. Thus
30 | /// we define it ourselves here.
31 | pub(crate) const SOL_XDP: libc::c_int = 283;
32 | 
33 | pub use self::user::{ReadComplete, ReadRx, WriteFill, WriteTx};
34 | 
35 | /// Internal structure shared for all rings.
36 | ///
37 | /// TODO: copied from , does everything make sense in Rust?
38 | #[repr(C)]
39 | #[derive(Debug)]
40 | struct XskRing {
41 |     /// _owned_ version of the producer head, may lag.
42 |     cached_producer: u32,
43 |     /// _owned_ version of the consumer head, may lag.
44 |     cached_consumer: u32,
45 |     /// Bit mask to quickly validate/force entry IDs.
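    /// With the power-of-two `size` below, `index & mask` maps any 32-bit index onto a slot;
    /// compare the notes on [`BufIdx`].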
 46 |     mask: u32,
 47 |     /// Number of entries (= mask + 1).
 48 |     size: u32,
 49 |     /// The mmaped-producer base.
 50 |     ///
 51 |     /// Note: Using lifetime static here, but we point into an `mmap` area and it is important that
 52 |     /// we do not outlive the binding. The constructor promises this.
 53 |     producer: &'static AtomicU32,
 54 |     /// The mmaped-consumer base.
 55 |     consumer: &'static AtomicU32,
 56 |     /// The mmaped-consumer ring control base.
 57 |     ring: NonNull<core::ffi::c_void>,
 58 |     /// The mmaped-consumer flags base.
 59 |     flags: NonNull<AtomicU32>,
 60 | }
 61 | 
 62 | /// Static configuration describing a memory area to use for ring chunks.
 63 | #[derive(Debug, Clone)]
 64 | pub struct UmemConfig {
 65 |     /// Number of entries in the fill queue.
 66 |     pub fill_size: u32,
 67 |     /// Number of entries in the completion queue.
 68 |     pub complete_size: u32,
 69 |     /// Size of data chunks in each of the ring queues.
 70 |     pub frame_size: u32,
 71 |     /// Reserved area at the start of the kernel area.
 72 |     pub headroom: u32,
 73 |     /// Flags to set with the creation calls.
 74 |     pub flags: u32,
 75 | }
 76 | 
 77 | /// Configuration for a created socket.
 78 | ///
 79 | /// Passed to [`Umem::rx_tx`].
 80 | #[derive(Debug, Default, Clone)]
 81 | pub struct SocketConfig {
 82 |     /// The number of receive descriptors in the ring.
 83 |     pub rx_size: Option<NonZeroU32>,
 84 |     /// The number of transmit descriptors in the ring.
 85 |     pub tx_size: Option<NonZeroU32>,
 86 |     /// Additional flags to pass to the `bind` call as part of `sockaddr_xdp`.
 87 |     pub bind_flags: u16,
 88 | }
 89 | 
 90 | /// The basic Umem descriptor.
 91 | ///
 92 | /// This struct manages the buffers themselves, in a high-level sense, not any of the
 93 | /// communication or queues.
 94 | ///
 95 | /// Compared to `libxdp`, no link to the queues is stored. Such a struct would necessitate
 96 | /// thread-safe access to the ring's producer and consumer queues. Instead, a `DeviceQueue` is the
 97 | /// owner of a device queue's fill/completion ring, but _not_ of the receive and transmission
 98 | /// rings. All other sockets with the same interface/queue depend on it but have their own packet
 99 | /// rings.
100 | ///
101 | /// You'll note that the fill and completion rings are a shared liveness requirement but under
102 | /// unique control. Exactly one process has the responsibility of maintaining them and ensuring the
103 | /// rings progress. Failing to do so impacts _all_ sockets sharing this `Umem`. The converse is not
104 | /// true. A single socket can starve its transmission buffer or refuse to accept received packets,
105 | /// but at worst this causes packet loss in that queue.
106 | ///
107 | /// The controller of the fill/completion pair also controls the associated bpf program which maps
108 | /// packets onto the set of sockets (aka. 'XSKMAP').
109 | // Implementation:
110 | pub struct Umem {
111 |     umem_area: NonNull<[u8]>,
112 |     config: UmemConfig,
113 |     fd: Arc<SocketFd>,
114 |     devices: DeviceControl,
115 | }
116 | 
117 | /// A raw pointer to a specific chunk in a Umem.
118 | ///
119 | /// It's unsafe to access the frame, by design. All aspects of _managing_ the contents of the
120 | /// kernel-shared memory are left to the user of the library.
121 | #[derive(Clone, Copy, Debug)]
122 | pub struct UmemChunk {
123 |     /// The address range associated with the chunk.
124 |     pub addr: NonNull<[u8]>,
125 |     /// The absolute offset of this chunk from the start of the Umem.
126 |     ///
127 |     /// This is the basis of the address calculation shared with the kernel.
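    /// For instance, a transmit descriptor built from this chunk uses this value (plus any
    /// displacement into the chunk) as its `XdpDesc::addr`; see `prepare_buffer` in the
    /// `flood` examples.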
128 |     pub offset: u64,
129 | }
130 | 
131 | #[derive(Clone)]
132 | struct DeviceControl {
133 |     /// The tracker, not critical for memory safety (here anyways) but correctness.
134 |     inner: Arc<dyn ControlSet>,
135 | }
136 | 
137 | /// A synchronized set for tracking which `IfCtx` are taken.
138 | trait ControlSet: Send + Sync + 'static {
139 |     fn insert(&self, _: IfCtx) -> bool;
140 |     fn contains(&self, _: &IfCtx) -> bool;
141 |     fn remove(&self, _: &IfCtx);
142 | }
143 | 
144 | /// One prepared socket for a receive/transmit pair.
145 | ///
146 | /// Note: it is not yet _bound_ to a specific `PF_XDP` address (device queue).
147 | pub struct Socket {
148 |     info: Arc<IfInfo>,
149 |     fd: Arc<SocketFd>,
150 | }
151 | 
152 | /// One device queue associated with an XDP socket.
153 | ///
154 | /// A socket is more specifically a set of receive and transmit queues for packets (mapping to
155 | /// some underlying hardware that moves those bytes onto a network). The fill and completion
156 | /// queues can, in theory, be shared with other sockets of the same `Umem`.
157 | pub struct DeviceQueue {
158 |     /// Fill and completion queues.
159 |     fcq: DeviceRings,
160 |     /// This is also a socket.
161 |     socket: Socket,
162 |     /// Reference to de-register.
163 |     devices: DeviceControl,
164 | }
165 | 
166 | /// An owner of receive/transmit queues.
167 | ///
168 | /// This represents a configured version of the raw `Socket`. It allows you to map the required
169 | /// rings and _then_ [`Umem::bind`] the socket, enabling the operations of the queues with the
170 | /// interface.
171 | pub struct User {
172 |     /// A clone of the socket it was created from.
173 |     socket: Socket,
174 |     /// The configuration with which it was created.
175 |     config: Arc<SocketConfig>,
176 |     /// A cached version of the map describing receive/transmit queues.
177 |     map: SocketMmapOffsets,
178 | }
179 | 
180 | /// A receiver queue.
181 | ///
182 | /// This also maintains the mmap of the associated queue.
183 | // Implemented in
184 | pub struct RingRx {
185 |     ring: RingCons,
186 |     fd: Arc<SocketFd>,
187 | }
188 | 
189 | /// A transmitter queue.
190 | ///
191 | /// This also maintains the mmap of the associated queue.
192 | // Implemented in
193 | pub struct RingTx {
194 |     ring: RingProd,
195 |     fd: Arc<SocketFd>,
196 | }
197 | 
198 | /// A complete (cached) information about a socket.
199 | ///
200 | /// Please allocate this, the struct is quite large. For instance, put it into an `Arc` as soon as
201 | /// it is no longer mutable, or initialize it in-place with [`Arc::get_mut`].
202 | #[derive(Clone, Copy)]
203 | pub struct IfInfo {
204 |     ctx: IfCtx,
205 |     ifname: [libc::c_char; libc::IFNAMSIZ],
206 | }
207 | 
208 | /// Reduced version of `IfInfo`, only retaining numeric IDs for the kernel.
209 | #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
210 | pub(crate) struct IfCtx {
211 |     ifindex: libc::c_uint,
212 |     queue_id: u32,
213 |     /// The namespace cookie, associated with a *socket*.
214 |     /// This field is filled by some surrounding struct containing the info.
215 |     netnscookie: u64,
216 | }
217 | 
218 | pub(crate) struct DeviceRings {
219 |     pub prod: RingProd,
220 |     pub cons: RingCons,
221 |     // Proof that we obtained this. Not sure if and where we'd use it.
222 |     #[allow(dead_code)]
223 |     pub(crate) map: SocketMmapOffsets,
224 | }
225 | 
226 | #[derive(Debug)]
227 | pub(crate) struct SocketMmapOffsets {
228 |     inner: XdpMmapOffsets,
229 | }
230 | 
231 | /// An index to an XDP buffer.
232 | ///
233 | /// Usually passed from a call reserving or checking available buffers (in [`RingProd`] and
234 | /// [`RingCons`] respectively) to one of the access functions. This resolves the raw index to a
235 | /// memory address in the ring buffer.
236 | ///
237 | /// This is _not_ a pure offset, a masking is needed to access the raw offset! The kernel requires
238 | /// the buffer count to be a power-of-two for this to be efficient. Then, producer and consumer
239 | /// heads operate on the 32-bit number range, _silently_ mapping to the same range of indices.
240 | /// (Similar to TCP segments, actually). Well-behaving sides will maintain the order of the two
241 | /// numbers in this wrapping space, which stays perfectly well-defined as long as less than `2**31`
242 | /// buffers are identified in total.
243 | ///
244 | /// In other words, you need a configured ring to determine an exact offset or compare two indices.
245 | ///
246 | /// This type does _not_ implement comparison traits or hashing! Nevertheless, there's nothing
247 | /// unsafe about creating or observing this detail, so feel free to construct your own or use the
248 | /// transparent layout to (unsafely) treat the type as a `u32` instead.
249 | #[repr(transparent)]
250 | #[derive(Debug, Copy, Clone)]
251 | pub struct BufIdx(pub u32);
252 | 
253 | /// A producer ring.
254 | ///
255 | /// Here, user space maintains the write head and the kernel the read tail.
256 | #[derive(Debug)]
257 | pub struct RingProd {
258 |     inner: XskRing,
259 |     mmap_addr: NonNull<[u8]>,
260 | }
261 | 
262 | /// A consumer ring.
263 | ///
264 | /// Here, the kernel maintains the write head and user space the read tail.
265 | #[derive(Debug)]
266 | pub struct RingCons {
267 |     inner: XskRing,
268 |     mmap_addr: NonNull<[u8]>,
269 | }
270 | 
271 | impl Default for UmemConfig {
272 |     fn default() -> Self {
273 |         UmemConfig {
274 |             fill_size: 1 << 11,
275 |             complete_size: 1 << 11,
276 |             frame_size: 1 << 12,
277 |             headroom: 0,
278 |             flags: 0,
279 |         }
280 |     }
281 | }
282 | 
283 | impl Drop for SocketFd {
284 |     fn drop(&mut self) {
285 |         let _ = unsafe { libc::close(self.0) };
286 |     }
287 | }
288 | 
289 | // FIXME: pending stabilization, use pointer::len directly.
290 | //
291 | //
292 | // FIXME: In 1.79 this was stabilized. Bump MSRV fine?
293 | fn ptr_len(ptr: *mut [u8]) -> usize {
294 |     unsafe { (*(ptr as *mut [()])).len() }
295 | }
296 | 
297 | impl Socket {
298 |     /// Get the raw file descriptor number underlying this socket.
299 |     pub fn as_raw_fd(&self) -> i32 {
300 |         self.fd.0
301 |     }
302 | }
303 | 
304 | impl User {
305 |     /// Get the raw file descriptor number underlying this socket.
306 |     pub fn as_raw_fd(&self) -> i32 {
307 |         self.socket.as_raw_fd()
308 |     }
309 | }
--------------------------------------------------------------------------------
/src/xsk/iface.rs:
--------------------------------------------------------------------------------
 1 | use core::ffi::CStr;
 2 | 
 3 | use super::{IfCtx, IfInfo, SocketFd, SocketMmapOffsets};
 4 | use crate::xdp::{XdpMmapOffsets, XdpMmapOffsetsV1, XdpStatistics, XdpStatisticsV2};
 5 | use crate::{Errno, LastErrno};
 6 | 
 7 | impl IfInfo {
 8 |     /// Create an info referring to no device.
 9 |     ///
10 |     /// This allows allocating an info to overwrite with more specific information.
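    ///
    /// Illustrative usage (a sketch mirroring `examples/flood.rs`):
    ///
    /// ```no_run
    /// # fn main() -> Result<(), xdpilone::Errno> {
    /// let mut info = xdpilone::IfInfo::invalid();
    /// info.from_name(core::ffi::CStr::from_bytes_with_nul(b"lo\0").unwrap())?;
    /// info.set_queue(0);
    /// # Ok(()) }
    /// ```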
    pub fn invalid() -> Self {
        IfInfo {
            ctx: IfCtx {
                ifindex: 0,
                queue_id: 0,
                netnscookie: 0,
            },
            ifname: [b'\0' as libc::c_char; libc::IFNAMSIZ],
        }
    }

    /// Set the information from an interface, by name.
    ///
    /// Common interface names may be `enp8s0`, `lo`, `wg0`, etc. The interface name-to-index pair
    /// will be very similar to what would be returned by `ip link show`.
    pub fn from_name(&mut self, st: &CStr) -> Result<(), Errno> {
        let bytes = st.to_bytes_with_nul();

        if bytes.len() > self.ifname.len() {
            return Err(Errno(libc::EINVAL));
        }

        assert!(bytes.len() <= self.ifname.len());
        let bytes = unsafe { &*(bytes as *const _ as *const [libc::c_char]) };
        let index = unsafe { libc::if_nametoindex(st.as_ptr()) };

        if index == 0 {
            return Err(LastErrno)?;
        }

        self.ctx.ifindex = index;
        self.ctx.queue_id = 0;
        self.ctx.netnscookie = 0;
        self.ifname[..bytes.len()].copy_from_slice(bytes);

        Ok(())
    }

    /// Set the information from an interface, by its numeric identifier.
    ///
    /// See [`Self::from_name`].
    pub fn from_ifindex(&mut self, index: libc::c_uint) -> Result<(), Errno> {
        let err = unsafe { libc::if_indextoname(index, self.ifname.as_mut_ptr()) };

        if err.is_null() {
            return Err(LastErrno)?;
        }

        // Also record the numeric context, mirroring `from_name`; otherwise `ifindex()` would
        // keep reporting a stale value.
        self.ctx.ifindex = index;
        self.ctx.queue_id = 0;
        self.ctx.netnscookie = 0;

        Ok(())
    }

    /// Configure the QueueID.
    ///
    /// This does _not_ guarantee that this queue is valid, or actually exists. You'll find out
    /// during the bind call. Most other ways of querying such information could suffer from TOCTOU
    /// issues in any case.
    pub fn set_queue(&mut self, queue_id: u32) {
        self.ctx.queue_id = queue_id;
    }

    /// Get the `ifindex`, numeric ID of the interface in the kernel, for the identified interface.
    pub fn ifindex(&self) -> u32 {
        self.ctx.ifindex
    }

    /// Get the queue ID previously set with `set_queue`.
    pub fn queue_id(&self) -> u32 {
        self.ctx.queue_id
    }
}

impl SocketMmapOffsets {
    const OPT_V1: libc::socklen_t = core::mem::size_of::<XdpMmapOffsetsV1>() as libc::socklen_t;
    const OPT_LATEST: libc::socklen_t = core::mem::size_of::<XdpMmapOffsets>() as libc::socklen_t;

    /// Query the socket mmap offsets of an XDP socket.
    pub fn new(sock: &SocketFd) -> Result<Self, Errno> {
        let mut this = SocketMmapOffsets {
            inner: Default::default(),
        };
        this.set_from_fd(sock)?;
        Ok(this)
    }

    /// Overwrite data with the socket mmap offsets of an XDP socket.
    ///
    /// This operation is atomic: On error, the previous values are retained. On success, the
    /// attributes have been updated.
    pub fn set_from_fd(&mut self, sock: &SocketFd) -> Result<(), Errno> {
        use crate::xdp::{XdpRingOffsets, XdpRingOffsetsV1};

        // In v1 the flags offset was implicit: the field sits directly after the consumer.
        fn fixup_v1(v1: XdpRingOffsetsV1) -> XdpRingOffsets {
            XdpRingOffsets {
                producer: v1.producer,
                consumer: v1.consumer,
                desc: v1.desc,
                flags: v1.consumer + core::mem::size_of::<u32>() as u64,
            }
        }

        union Offsets {
            v1: XdpMmapOffsetsV1,
            latest: XdpMmapOffsets,
            init: (),
        }

        let mut off = Offsets { init: () };
        let mut optlen: libc::socklen_t = core::mem::size_of_val(&off) as libc::socklen_t;

        let err = unsafe {
            libc::getsockopt(
                sock.0,
                super::SOL_XDP,
                super::Umem::XDP_MMAP_OFFSETS,
                (&mut off) as *mut _ as *mut libc::c_void,
                &mut optlen,
            )
        };

        if err != 0 {
            return Err(LastErrno)?;
        }

        match optlen {
            Self::OPT_V1 => {
                let v1 = unsafe { off.v1 };

                self.inner = XdpMmapOffsets {
                    rx: fixup_v1(v1.rx),
                    tx: fixup_v1(v1.tx),
                    fr: fixup_v1(v1.fr),
                    cr: fixup_v1(v1.cr),
                };

                Ok(())
            }
            Self::OPT_LATEST => {
                self.inner = unsafe { off.latest };
                Ok(())
            }
            _ => Err(Errno(libc::EINVAL)),
        }
    }
}

impl XdpStatistics {
    pub(crate) fn new(sock: &SocketFd) -> Result<Self, Errno> {
        let mut this = Self::default();
        this.set_from_fd(sock)?;
        Ok(this)
    }

    pub(crate) fn set_from_fd(&mut self, sock: &SocketFd) -> Result<(), Errno> {
        let mut optlen: libc::socklen_t = core::mem::size_of_val(self) as libc::socklen_t;
        let err = unsafe {
            libc::getsockopt(
                sock.0,
                super::SOL_XDP,
                super::Umem::XDP_STATISTICS,
                self as *mut _ as *mut libc::c_void,
                &mut optlen,
            )
        };

        if err != 0 {
            return Err(LastErrno)?;
        }

        Ok(())
    }
}

impl XdpStatisticsV2 {
    pub(crate) fn new(sock: &SocketFd) -> Result<Self, Errno> {
        let mut this = Self::default();
        this.set_from_fd(sock)?;
        Ok(this)
    }

    pub(crate) fn set_from_fd(&mut self, sock: &SocketFd) -> Result<(), Errno> {
        let mut optlen: libc::socklen_t = core::mem::size_of_val(self) as libc::socklen_t;
        let err = unsafe {
            libc::getsockopt(
                sock.0,
                super::SOL_XDP,
                super::Umem::XDP_STATISTICS,
                self as *mut _ as *mut libc::c_void,
                &mut optlen,
            )
        };

        if err != 0 {
            return Err(LastErrno)?;
        }

        Ok(())
    }
}
--------------------------------------------------------------------------------
/src/xsk/ring.rs:
--------------------------------------------------------------------------------
use core::sync::atomic::Ordering;
use core::{ops::RangeInclusive, ptr::NonNull};

use crate::xdp::{XdpDesc, XdpRingOffsets};
use crate::xsk::{BufIdx, RingCons, RingProd, SocketFd, SocketMmapOffsets, XskRing};
use crate::{Errno, LastErrno};

impl XskRing {
    const XDP_PGOFF_RX_RING: libc::off_t = 0;
    const XDP_PGOFF_TX_RING: libc::off_t = 0x80000000;
    const XDP_UMEM_PGOFF_FILL_RING: libc::off_t = 0x100000000;
    const XDP_UMEM_PGOFF_COMPLETION_RING: libc::off_t = 0x180000000;

    /// Construct a ring from an mmap given by the kernel.
    ///
    /// # Safety
    ///
    /// The caller is responsible for ensuring that the memory mapping is valid, and **outlives**
    /// the ring itself. Please attach a reference counted pointer to the controller or something
    /// of that sort.
    ///
    /// The caller must ensure that the memory region is not aliased mutably elsewhere. Demanding
    /// exclusive access would be wrong anyway, because the kernel may write to it, i.e. it is not
    /// immutable! Shared aliasing is okay.
    pub unsafe fn new(tx_map: NonNull<u8>, off: &XdpRingOffsets, count: u32) -> Self {
        debug_assert!(count.is_power_of_two());
        let tx_map: *mut u8 = tx_map.as_ptr();
        let trust_offset = |off: u64| NonNull::new_unchecked(tx_map.offset(off as isize));

        let producer = trust_offset(off.producer).cast().as_ref();
        let consumer = trust_offset(off.consumer).cast().as_ref();

        let ring = trust_offset(off.desc).cast();
        let flags = trust_offset(off.flags).cast();

        XskRing {
            mask: count - 1,
            size: count,
            producer,
            consumer,
            ring,
            flags,
            cached_producer: producer.load(Ordering::Relaxed),
            cached_consumer: consumer.load(Ordering::Relaxed),
        }
    }

    unsafe fn map(
        fd: &SocketFd,
        off: &XdpRingOffsets,
        count: u32,
        sz: u64,
        offset: libc::off_t,
    ) -> Result<(Self, NonNull<[u8]>), Errno> {
        let len = (off.desc + u64::from(count) * sz) as usize;

        let mmap = unsafe {
            libc::mmap(
                core::ptr::null_mut(),
                len,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_SHARED | libc::MAP_POPULATE,
                fd.0,
                offset,
            )
        };

        if mmap == libc::MAP_FAILED {
            return Err(LastErrno)?;
        }

        assert!(!mmap.is_null());
        // Safety: as by MMap this pointer is valid.
        let mmap_addr = core::ptr::slice_from_raw_parts_mut(mmap as *mut u8, len);
        let mmap_addr = unsafe { NonNull::new_unchecked(mmap_addr) };
        let nn = mmap_addr.cast();

        Ok((XskRing::new(nn, off, count), mmap_addr))
    }
}

impl RingProd {
    /// # Safety
    ///
    /// The caller must only pass `fd` and `off` if they correspond as they were returned by the
    /// kernel.
    pub(crate) unsafe fn fill(
        fd: &SocketFd,
        off: &SocketMmapOffsets,
        count: u32,
    ) -> Result<Self, Errno> {
        let (inner, mmap_addr) = XskRing::map(
            fd,
            &off.inner.fr,
            count,
            core::mem::size_of::<u64>() as u64,
            XskRing::XDP_UMEM_PGOFF_FILL_RING,
        )?;

        Ok(RingProd { inner, mmap_addr })
    }

    /// # Safety
    ///
    /// The caller must only pass `fd` and `off` if they correspond as they were returned by the
    /// kernel.
    pub(crate) unsafe fn tx(
        fd: &SocketFd,
        off: &SocketMmapOffsets,
        count: u32,
    ) -> Result<Self, Errno> {
        let (inner, mmap_addr) = XskRing::map(
            fd,
            &off.inner.tx,
            count,
            core::mem::size_of::<XdpDesc>() as u64,
            XskRing::XDP_PGOFF_TX_RING,
        )?;

        Ok(RingProd { inner, mmap_addr })
    }

    /// Return the address of an address descriptor.
    ///
    /// # Safety
    ///
    /// To be used only in fill and complete rings. Further, the caller guarantees that the `idx`
    /// parameter is valid for the ring.
    pub(crate) unsafe fn fill_addr(&self, idx: BufIdx) -> NonNull<u64> {
        let offset = (idx.0 & self.inner.mask) as isize;
        let base = self.inner.ring.cast::<u64>().as_ptr();
        unsafe { NonNull::new_unchecked(base.offset(offset)) }
    }

    /// Return the address of a buffer descriptor.
    ///
    /// # Safety
    ///
    /// To be used only in receive and transmit rings. Further, the caller guarantees that the
    /// `idx` parameter is valid for the ring.
    pub(crate) unsafe fn tx_desc(&self, idx: BufIdx) -> NonNull<XdpDesc> {
        let offset = (idx.0 & self.inner.mask) as isize;
        let base = self.inner.ring.cast::<XdpDesc>().as_ptr();
        unsafe { NonNull::new_unchecked(base.offset(offset)) }
    }

    /// Query the number of free entries, refreshing the view if it is below `minimum`.
    ///
    /// Serves small requests based on cached state about the kernel's consumer head. Large
    /// requests may thus incur an extra refresh of the consumer head.
    pub fn count_free(&mut self, minimum: u32) -> u32 {
        let free_entries = self
            .inner
            .cached_consumer
            .wrapping_sub(self.inner.cached_producer);

        if free_entries >= minimum {
            return free_entries;
        }

        self.inner.cached_consumer = self.inner.consumer.load(Ordering::Acquire);
        // No-op modulo the size, but ensures our view of the consumer is always ahead of the
        // producer, no matter buffer counts and mask.
        // TODO: actually, I don't _quite_ understand. This algorithm is copied from libxdp.
        self.inner.cached_consumer += self.inner.size;

        self.inner.cached_consumer - self.inner.cached_producer
    }

    /// Reserve some buffers on our side, without submitting them to the kernel yet.
    ///
    /// Writes the index of the next available buffer into `idx`. Fails if less than the requested
    /// amount of buffers can be reserved. Returns the number of actual buffers reserved.
    pub fn reserve(&mut self, nb: RangeInclusive<u32>, idx: &mut BufIdx) -> u32 {
        let (start, end) = (*nb.start(), *nb.end());
        let free = self.count_free(start);

        if free < start {
            return 0;
        }

        let free = free.min(end);
        *idx = BufIdx(self.inner.cached_producer);
        self.inner.cached_producer += free;

        free
    }

    /// Cancel a previous `reserve`.
    ///
    /// If passed a smaller number, the remaining reservation stays active.
    pub fn cancel(&mut self, nb: u32) {
        self.inner.cached_producer -= nb;
    }

    /// Submit a number of buffers.
    ///
    /// Note: the client side state is _not_ adjusted. If you've called `reserve` before, take
    /// care to maintain a consistent view.
    ///
    /// TODO: interestingly this could be implemented on a shared reference. But is doing so
    /// useful? There's no affirmation that the _intended_ buffers are submitted.
    pub fn submit(&mut self, nb: u32) {
        // We are the only writer, all other writes are ordered before.
        let cur = self.inner.producer.load(Ordering::Relaxed);
        // When the kernel reads it, all writes to buffers must be ordered before this write to the
        // head, this represents the memory synchronization edge.
        self.inner
            .producer
            .store(cur.wrapping_add(nb), Ordering::Release);
    }

    /// Get the raw difference between consumer and producer heads in shared memory.
    ///
    /// Both variables are loaded with _relaxed_ loads. No synchronization with any other memory
    /// operations is implied by calling this method. For this, you would need to make sure to
    /// have some form of barrier, acquire on receiving and release on transmitting, for
    /// operations within chunks.
    pub fn count_pending(&self) -> u32 {
        let committed = self.inner.producer.load(Ordering::Relaxed);
        let consumed = self.inner.consumer.load(Ordering::Relaxed);

        committed.wrapping_sub(consumed)
    }

    /// Return the bits behind the `flags` register in the mmap.
    pub fn check_flags(&self) -> u32 {
        unsafe { *self.inner.flags.as_ptr() }
    }
}

impl RingCons {
    /// Create a completion ring.
    ///
    /// # Safety
    ///
    /// The caller must only pass `fd` and `off` if they correspond as they were returned by the
    /// kernel.
    pub(crate) unsafe fn comp(
        fd: &SocketFd,
        off: &SocketMmapOffsets,
        count: u32,
    ) -> Result<Self, Errno> {
        let (inner, mmap_addr) = XskRing::map(
            fd,
            &off.inner.cr,
            count,
            core::mem::size_of::<u64>() as u64,
            XskRing::XDP_UMEM_PGOFF_COMPLETION_RING,
        )?;

        Ok(RingCons { inner, mmap_addr })
    }

    /// Create a receive ring.
    ///
    /// # Safety
    ///
    /// The caller must only pass `fd` and `off` if they correspond as they were returned by the
    /// kernel.
    pub(crate) unsafe fn rx(
        fd: &SocketFd,
        off: &SocketMmapOffsets,
        count: u32,
    ) -> Result<Self, Errno> {
        let (inner, mmap_addr) = XskRing::map(
            fd,
            &off.inner.rx,
            count,
            core::mem::size_of::<XdpDesc>() as u64,
            XskRing::XDP_PGOFF_RX_RING,
        )?;

        Ok(RingCons { inner, mmap_addr })
    }

    /// Get a pointer to an address descriptor in the ring.
    ///
    /// # Safety
    ///
    /// This ring must be a Fill or Completion ring.
    pub unsafe fn comp_addr(&self, idx: BufIdx) -> NonNull<u64> {
        let offset = (idx.0 & self.inner.mask) as isize;
        let base = self.inner.ring.cast::<u64>().as_ptr();
        // Safety: all offsets within `self.inner.mask` are valid in our mmap.
        unsafe { NonNull::new_unchecked(base.offset(offset)) }
    }

    /// Get a pointer to an XDP frame descriptor in the ring.
    ///
    /// # Safety
    ///
    /// This ring must be a Receive or Transmit ring.
    pub unsafe fn rx_desc(&self, idx: BufIdx) -> NonNull<XdpDesc> {
        let offset = (idx.0 & self.inner.mask) as isize;
        let base = self.inner.ring.cast::<XdpDesc>().as_ptr();
        // Safety: all offsets within `self.inner.mask` are valid in our mmap.
        unsafe { NonNull::new_unchecked(base.offset(offset)) }
    }

    /// Find the number of available entries.
    ///
    /// Any count lower than `expected` will try to refresh the cached producer head.
    pub fn count_available(&mut self, expected: u32) -> u32 {
        let mut available = self
            .inner
            .cached_producer
            .wrapping_sub(self.inner.cached_consumer);

        if available < expected {
            // Refresh the cached producer head with a single acquire load and derive the count
            // from that same value, so the returned count stays consistent with the cache.
            self.inner.cached_producer = self.inner.producer.load(Ordering::Acquire);
            available = self
                .inner
                .cached_producer
                .wrapping_sub(self.inner.cached_consumer);
        }

        available
    }

    /// Get the raw difference between consumer and producer heads in shared memory.
    ///
    /// Both variables are loaded with _relaxed_ loads. No synchronization with any other memory
    /// operations is implied by calling this method. For this, you would need to make sure to
    /// have some form of barrier, acquire on receiving and release on transmitting, for
    /// operations within chunks.
    pub fn count_pending(&self) -> u32 {
        let available = self.inner.producer.load(Ordering::Relaxed);
        let consumed = self.inner.consumer.load(Ordering::Relaxed);

        available.wrapping_sub(consumed)
    }

    pub(crate) fn peek(&mut self, nb: RangeInclusive<u32>, idx: &mut BufIdx) -> u32 {
        let (start, end) = (*nb.start(), *nb.end());
        let count = self.count_available(start);

        if count < start {
            return 0;
        }

        let count = count.min(end);
        *idx = BufIdx(self.inner.cached_consumer);
        self.inner.cached_consumer += count;

        count
    }

    /// Cancel a previous `peek`.
    ///
    /// If passed a smaller number, the remaining reservation stays active.
    pub fn cancel(&mut self, nb: u32) {
        self.inner.cached_consumer -= nb;
    }

    /// Mark some buffers as processed.
    ///
    /// TODO: interestingly this could be implemented on a shared reference. But is doing so
    /// useful? There's no affirmation that the _intended_ buffers are submitted.
    pub fn release(&mut self, nb: u32) {
        // We are the only writer, all other writes are ordered before.
        let cur = self.inner.consumer.load(Ordering::Relaxed);
        // All our reads from buffers must be ordered before this write to the head, this
        // represents the memory synchronization edge.
        self.inner
            .consumer
            .store(cur.wrapping_add(nb), Ordering::Release);
    }

    /// Return the flags, as indicated by the kernel in shared memory.
    pub fn check_flags(&self) -> u32 {
        unsafe { *self.inner.flags.as_ptr() }
    }
}

impl Drop for RingProd {
    fn drop(&mut self) {
        let len = super::ptr_len(self.mmap_addr.as_ptr());
        unsafe { libc::munmap(self.mmap_addr.as_ptr() as *mut _, len) };
    }
}

impl Drop for RingCons {
    fn drop(&mut self) {
        let len = super::ptr_len(self.mmap_addr.as_ptr());
        unsafe { libc::munmap(self.mmap_addr.as_ptr() as *mut _, len) };
    }
}

// Safety: `NonNull` here controls an `mmap`. All other values are almost trivially safe to send
// to a different thread. Indeed, we hold no shared reference `&_` to any non-`Sync` resource,
// which makes this sound by definition.
unsafe impl Send for XskRing {}
unsafe impl Send for RingProd {}
unsafe impl Send for RingCons {}
--------------------------------------------------------------------------------
/src/xsk/socket.rs:
--------------------------------------------------------------------------------
use alloc::sync::Arc;

use crate::xsk::{IfInfo, Socket, SocketFd, Umem};
use crate::{Errno, LastErrno};

impl Socket {
    const SO_NETNS_COOKIE: libc::c_int = 71;
    const INIT_NS: u64 = 1;

    /// Create a new socket for a given interface.
    pub fn new(interface: &IfInfo) -> Result<Self, Errno> {
        let fd = Arc::new(SocketFd::new()?);
        Self::with_xdp_socket(interface, fd)
    }

    /// Create a socket using the FD of the `umem`.
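    ///
    /// For illustration, a minimal sketch (assuming the types are re-exported at the crate
    /// root): reusing the `umem` descriptor avoids a second `socket(2)` call for the queue that
    /// also hosts the fill/completion rings.
    ///
    /// ```no_run
    /// use xdpilone::{IfInfo, Socket, Umem};
    ///
    /// fn first_queue(info: &IfInfo, umem: &Umem) -> Result<Socket, xdpilone::Errno> {
    ///     // Shares the file descriptor that already backs the `umem`.
    ///     Socket::with_shared(info, umem)
    /// }
    /// ```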
    pub fn with_shared(interface: &IfInfo, umem: &Umem) -> Result<Self, Errno> {
        Self::with_xdp_socket(interface, umem.fd.clone())
    }

    fn with_xdp_socket(interface: &IfInfo, fd: Arc<SocketFd>) -> Result<Self, Errno> {
        let mut info = Arc::new(*interface);

        let mut netnscookie: u64 = 0;
        let mut optlen: libc::socklen_t = core::mem::size_of_val(&netnscookie) as libc::socklen_t;
        let err = unsafe {
            libc::getsockopt(
                fd.0,
                libc::SOL_SOCKET,
                Self::SO_NETNS_COOKIE,
                (&mut netnscookie) as *mut _ as *mut libc::c_void,
                &mut optlen,
            )
        };

        // `getsockopt` reports failure by returning `-1`; the error code itself is in `errno`.
        if err != 0 {
            match Errno::last_os_error() {
                // Kernel does not know the cookie option: assume the initial namespace.
                Errno(libc::ENOPROTOOPT) => netnscookie = Self::INIT_NS,
                err => return Err(err),
            }
        }

        // Won't reallocate in practice.
        Arc::make_mut(&mut info).ctx.netnscookie = netnscookie;

        Ok(Socket { fd, info })
    }
}

impl SocketFd {
    pub(crate) fn new() -> Result<Self, Errno> {
        let fd = unsafe { libc::socket(libc::AF_XDP, libc::SOCK_RAW, 0) };
        if fd < 0 {
            return Err(LastErrno)?;
        }
        Ok(SocketFd(fd))
    }
}
--------------------------------------------------------------------------------
/src/xsk/umem.rs:
--------------------------------------------------------------------------------
use core::ptr::NonNull;

use alloc::collections::BTreeSet;
use alloc::sync::Arc;

use crate::xdp::{SockAddrXdp, XdpDesc, XdpStatistics, XdpStatisticsV2, XdpUmemReg};
use crate::xsk::{
    ptr_len, BufIdx, DeviceControl, DeviceQueue, DeviceRings, IfCtx, RingCons, RingProd, RingRx,
    RingTx, Socket, SocketConfig, SocketFd, SocketMmapOffsets, Umem, UmemChunk, UmemConfig, User,
};
use crate::{Errno, LastErrno};

use spin::RwLock;

impl BufIdx {
    /// Convert a slice of raw numbers to buffer indices, in-place.
    pub fn from_slice(id: &[u32]) -> &[Self] {
        unsafe { &*(id as *const [u32] as *const [Self]) }
    }

    /// Convert a slice of raw numbers to buffer indices, in-place.
    pub fn from_mut_slice(id: &mut [u32]) -> &mut [Self] {
        unsafe { &mut *(id as *mut [u32] as *mut [Self]) }
    }

    /// Convert a slice of buffer indices to raw numbers, in-place.
    pub fn to_slice(this: &[Self]) -> &[u32] {
        unsafe { &*(this as *const [Self] as *const [u32]) }
    }

    /// Convert a slice of buffer indices to raw numbers, in-place.
    pub fn to_mut_slice(this: &mut [Self]) -> &mut [u32] {
        unsafe { &mut *(this as *mut [Self] as *mut [u32]) }
    }
}

impl Umem {
    /* Socket options for XDP */
    pub(crate) const XDP_MMAP_OFFSETS: libc::c_int = 1;
    pub(crate) const XDP_RX_RING: libc::c_int = 2;
    pub(crate) const XDP_TX_RING: libc::c_int = 3;
    pub(crate) const XDP_UMEM_REG: libc::c_int = 4;
    pub(crate) const XDP_UMEM_FILL_RING: libc::c_int = 5;
    pub(crate) const XDP_UMEM_COMPLETION_RING: libc::c_int = 6;
    pub(crate) const XDP_STATISTICS: libc::c_int = 7;
    #[allow(dead_code)]
    pub(crate) const XDP_OPTIONS: libc::c_int = 8;

    /// Create a new Umem ring.
    ///
    /// # Safety
    ///
    /// The caller passes an area denoting the memory of the ring. It must be valid for the
    /// indicated buffer size and count. The caller is also responsible for keeping the mapping
    /// alive.
    ///
    /// The area must be page aligned and must not exceed `i64::MAX` in length (on future systems
    /// where that is possible).
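    ///
    /// For illustration, a hedged sketch of obtaining a suitable area from an anonymous,
    /// page-aligned `mmap`; the area length is an assumption of the example:
    ///
    /// ```no_run
    /// use core::ptr::NonNull;
    /// use xdpilone::{Umem, UmemConfig};
    ///
    /// const AREA_LEN: usize = 1 << 20; // 256 frames at the default 4 KiB frame size.
    /// let ptr = unsafe {
    ///     libc::mmap(
    ///         core::ptr::null_mut(),
    ///         AREA_LEN,
    ///         libc::PROT_READ | libc::PROT_WRITE,
    ///         libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
    ///         -1,
    ///         0,
    ///     )
    /// };
    /// assert!(ptr != libc::MAP_FAILED);
    /// let area = core::ptr::slice_from_raw_parts_mut(ptr as *mut u8, AREA_LEN);
    /// let area = NonNull::new(area).unwrap();
    /// // Safety: an anonymous mapping is page aligned; it is never unmapped here, thus it
    /// // outlives the `Umem`.
    /// let umem = unsafe { Umem::new(UmemConfig::default(), area) }?;
    /// # Ok::<(), xdpilone::Errno>(())
    /// ```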
    pub unsafe fn new(config: UmemConfig, area: NonNull<[u8]>) -> Result<Self, Errno> {
        fn is_page_aligned(area: NonNull<[u8]>) -> bool {
            let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) } as usize;
            // TODO: use `addr()` as we don't need to expose the pointer here. Just the address as
            // an integer and no provenance-preserving cast intended.
            (area.as_ptr() as *mut u8 as usize & (page_size - 1)) == 0
        }

        assert!(config.frame_size > 0, "Invalid frame size");

        assert!(
            is_page_aligned(area),
            "UB: Bad mmap area provided, but caller is responsible for its soundness."
        );

        let area_size = ptr_len(area.as_ptr());

        assert!(
            u64::try_from(area_size).is_ok(),
            "Unhandled address space calculation"
        );

        let devices = DeviceControl {
            inner: Arc::new(SpinLockedControlSet::default()),
        };

        // Two steps:
        // 1. Create a new XDP socket in the kernel.
        // 2. Configure it with the area and size.
        // Safety: correct `socket` call.
        let umem = Umem {
            config,
            fd: Arc::new(SocketFd::new()?),
            umem_area: area,
            devices,
        };

        Self::configure(&umem)?;

        Ok(umem)
    }

    /// Get the address associated with a buffer, if it is in-bounds.
    ///
    /// # Safety
    ///
    /// No requirements. However, please ensure that _use_ of the pointer is done properly. The
    /// pointer is guaranteed to be derived from the `area` passed in the constructor. The method
    /// guarantees that it does not _access_ any of the pointers in this process.
    pub fn frame(&self, idx: BufIdx) -> Option<UmemChunk> {
        let pitch: u32 = self.config.frame_size;
        let idx: u32 = idx.0;
        let area_size = ptr_len(self.umem_area.as_ptr()) as u64;

        // Validate that it fits. Note: `None` (a pitch larger than the whole area) orders before
        // any `Some`, so that case also bails out.
        let offset = u64::from(pitch) * u64::from(idx);
        if area_size.checked_sub(u64::from(pitch)) < Some(offset) {
            return None;
        }

        // Now: area_size is converted, without loss, from an isize that denotes the [u8] length,
        // valid as guaranteed by the caller of the constructor. We have just checked:
        //
        // `[offset..offset+pitch) < area_size`.
        //
        // So all of the following is within the bounds of the constructor-guaranteed
        // address manipulation.
        let base = unsafe { self.umem_area.cast::<u8>().as_ptr().offset(offset as isize) };
        debug_assert!(!base.is_null(), "UB: offsetting area within produced NULL");
        let slice = core::ptr::slice_from_raw_parts_mut(base, pitch as usize);
        let addr = unsafe { NonNull::new_unchecked(slice) };
        Some(UmemChunk { addr, offset })
    }

    /// Count the number of available data frames.
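    ///
    /// For illustration (the sizes are assumptions of the example): a 1 MiB area with the
    /// default 4 KiB frames yields 256 frames.
    ///
    /// ```
    /// let (area_size, frame_size) = (1u64 << 20, 1u64 << 12);
    /// assert_eq!(area_size / frame_size, 256);
    /// ```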
    pub fn len_frames(&self) -> u32 {
        let area_size = ptr_len(self.umem_area.as_ptr()) as u64;
        let count = area_size / u64::from(self.config.frame_size);
        u32::try_from(count).unwrap_or(u32::MAX)
    }

    fn configure(this: &Umem) -> Result<(), Errno> {
        let mut mr = XdpUmemReg {
            addr: this.umem_area.as_ptr() as *mut u8 as u64,
            len: ptr_len(this.umem_area.as_ptr()) as u64,
            chunk_size: this.config.frame_size,
            headroom: this.config.headroom,
            flags: this.config.flags,
            ..XdpUmemReg::default()
        };

        let optlen = core::mem::size_of_val(&mr) as libc::socklen_t;
        let err = unsafe {
            libc::setsockopt(
                this.fd.0,
                super::SOL_XDP,
                Self::XDP_UMEM_REG,
                (&mut mr) as *mut _ as *mut libc::c_void,
                optlen,
            )
        };

        if err != 0 {
            return Err(LastErrno)?;
        }

        Ok(())
    }

    /// Configure the fill and completion queue for an interface queue.
    ///
    /// The caller _should_ only call this once for each interface info. However, it's not entirely
    /// incorrect to do it multiple times; just be aware that the bookkeeping becomes extra messy.
    /// All code is written under the assumption that only one controller/writer for the
    /// user-space portions of each queue is active at a time. The kernel won't care about your
    /// broken code and race conditions writing to the same queue concurrently. It's an SPSC.
    /// Probably only the first call for each interface succeeds.
    pub fn fq_cq(&self, interface: &Socket) -> Result<DeviceQueue, Errno> {
        if !self.devices.insert(interface.info.ctx) {
            // We know this will just yield `-EBUSY` anyways.
            return Err(Errno(libc::EINVAL));
        }

        struct DropableDevice<'info>(&'info IfCtx, &'info DeviceControl);

        impl Drop for DropableDevice<'_> {
            fn drop(&mut self) {
                self.1.remove(self.0);
            }
        }

        // Okay, got a device. Let's create the queues for it. On failure, cleanup.
        let _tmp_device = DropableDevice(&interface.info.ctx, &self.devices);

        let sock = &*interface.fd;
        Self::configure_cq(sock, &self.config)?;
        let map = SocketMmapOffsets::new(sock)?;

        // FIXME: should we be configuring the `cached_consumer` and `cached_producer` and
        // potentially other values, here? The setup produces a very rough clone of _just_ the ring
        // itself and none of the logic beyond.
        let prod = unsafe { RingProd::fill(sock, &map, self.config.fill_size) }?;
        let cons = unsafe { RingCons::comp(sock, &map, self.config.complete_size) }?;

        let device = DeviceQueue {
            fcq: DeviceRings { map, cons, prod },
            socket: Socket {
                info: interface.info.clone(),
                fd: interface.fd.clone(),
            },
            devices: self.devices.clone(),
        };

        core::mem::forget(_tmp_device);
        Ok(device)
    }

    /// Configure the device address for a socket.
    ///
    /// Either `rx_size` or `tx_size` must be non-zero, i.e. the call to bind will fail if none of
    /// the rings is actually configured.
    ///
    /// Note: if the underlying socket is shared then this will also bind other objects that share
    /// the underlying socket file descriptor; this is intended.
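    ///
    /// For illustration, a hedged sketch of the intended flow; the ring sizes and the exact
    /// field set of `SocketConfig` are assumptions of the example:
    ///
    /// ```no_run
    /// use xdpilone::{IfInfo, Socket, SocketConfig, Umem};
    ///
    /// fn setup(umem: &Umem, info: &IfInfo) -> Result<(), xdpilone::Errno> {
    ///     let socket = Socket::with_shared(info, umem)?;
    ///     // Let this socket own the fill/completion rings of the device queue.
    ///     let fq_cq = umem.fq_cq(&socket)?;
    ///
    ///     let config = SocketConfig {
    ///         rx_size: core::num::NonZeroU32::new(1 << 11),
    ///         tx_size: core::num::NonZeroU32::new(1 << 11),
    ///         bind_flags: 0,
    ///     };
    ///     let user = umem.rx_tx(&socket, &config)?;
    ///     // Map the rings first, _then_ activate the queues by binding.
    ///     let rx = user.map_rx()?;
    ///     let tx = user.map_tx()?;
    ///     umem.bind(&user)?;
    ///     # let _ = (fq_cq, rx, tx);
    ///     Ok(())
    /// }
    /// ```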
    pub fn rx_tx(&self, interface: &Socket, config: &SocketConfig) -> Result<User, Errno> {
        let sock = &*interface.fd;
        Self::configure_rt(sock, config)?;
        let map = SocketMmapOffsets::new(sock)?;

        Ok(User {
            socket: Socket {
                info: interface.info.clone(),
                fd: interface.fd.clone(),
            },
            config: Arc::new(config.clone()),
            map,
        })
    }

    /// Activate a socket by binding it to a device.
    ///
    /// This associates the umem region with these queues. This is intended for:
    ///
    /// - sockets that maintain the fill and completion ring for a device queue, i.e. `fq_cq` was
    ///   called with the socket and that network interface queue is currently being bound.
    ///
    /// - queues that the umem socket file descriptor is maintaining as a device queue, i.e. the
    ///   call to `fq_cq` used a socket created with [`Socket::with_shared`] that utilized the
    ///   [`Umem`] instance.
    ///
    /// Otherwise, when a pure rx/tx socket should be set up, use [`DeviceQueue::bind`] with the
    /// previously bound socket providing its fill/completion queues.
    ///
    /// The tree of parents should look as follows:
    ///
    /// ```text
    /// fd0: umem [+fq/cq for ifq0] [+rx/+tx]
    ///  |- [fd1: socket +rx/tx on ifq0 if fd0 has fq/cq]   Umem::bind(fd0, fd1)
    ///  |- [fd2: socket +rx/tx on ifq0 if fd0 has fq/cq …] Umem::bind(fd0, fd2)
    ///  |
    ///  |- fd3: socket +fq/cq for ifq1 [+rx/tx]            Umem::bind(fd0, fd3)
    ///  |   |- fd4: socket +rx/tx on ifq1                  DeviceQueue::bind(fd3, fd4)
    ///  |   |- fd5: socket +rx/tx on ifq1 …                DeviceQueue::bind(fd3, fd5)
    ///  |
    ///  |- fd6: socket +fq/cq for ifq2 [+rx/tx]            Umem::bind(fd0, fd6)
    ///  |   |- fd7: socket +rx/tx on ifq2                  DeviceQueue::bind(fd6, fd7)
    ///  |   |- …
    /// ```
    pub fn bind(&self, interface: &User) -> Result<(), Errno> {
        Self::bind_at(interface, &self.fd)
    }

    fn bind_at(interface: &User, umem_sock: &SocketFd) -> Result<(), Errno> {
        let mut sxdp = SockAddrXdp {
            ifindex: interface.socket.info.ctx.ifindex,
            queue_id: interface.socket.info.ctx.queue_id,
            flags: interface.config.bind_flags,
            ..SockAddrXdp::default()
        };

        // Note: using a separate socket with shared umem requires one dedicated configured cq for
        // the interface indicated.
        if interface.socket.fd.0 != umem_sock.0 {
            sxdp.flags |= SocketConfig::XDP_BIND_SHARED_UMEM;
            sxdp.shared_umem_fd = umem_sock.0 as u32;
        }

        if unsafe {
            libc::bind(
                interface.socket.fd.0,
                (&sxdp) as *const _ as *const libc::sockaddr,
                core::mem::size_of_val(&sxdp) as libc::socklen_t,
            )
        } != 0
        {
            return Err(LastErrno)?;
        }

        Ok(())
    }

    pub(crate) fn configure_cq(fd: &SocketFd, config: &UmemConfig) -> Result<(), Errno> {
        if unsafe {
            libc::setsockopt(
                fd.0,
                super::SOL_XDP,
                Umem::XDP_UMEM_COMPLETION_RING,
                (&config.complete_size) as *const _ as *const libc::c_void,
                core::mem::size_of_val(&config.complete_size) as libc::socklen_t,
            )
        } != 0
        {
            return Err(LastErrno)?;
        }

        if unsafe {
            libc::setsockopt(
                fd.0,
                super::SOL_XDP,
                Umem::XDP_UMEM_FILL_RING,
                (&config.fill_size) as *const _ as *const libc::c_void,
                core::mem::size_of_val(&config.fill_size) as libc::socklen_t,
            )
        } != 0
        {
            return Err(LastErrno)?;
        }

        Ok(())
    }

    pub(crate) fn configure_rt(fd: &SocketFd, config: &SocketConfig) -> Result<(), Errno> {
        if let Some(num) = config.rx_size {
            if unsafe {
                libc::setsockopt(
                    fd.0,
                    super::SOL_XDP,
                    Umem::XDP_RX_RING,
                    (&num) as *const _ as *const libc::c_void,
                    core::mem::size_of_val(&num) as libc::socklen_t,
                )
            } != 0
            {
                return Err(LastErrno)?;
            }
        }

        if let Some(num) = config.tx_size {
            if unsafe {
                libc::setsockopt(
                    fd.0,
                    super::SOL_XDP,
                    Umem::XDP_TX_RING,
                    (&num) as *const _ as *const libc::c_void,
                    core::mem::size_of_val(&num) as libc::socklen_t,
                )
            } != 0
            {
                return Err(LastErrno)?;
            }
        }

        Ok(())
    }
}

impl DeviceQueue {
    /// Get the statistics of this XDP socket.
    #[deprecated = "Consider using `statistics_v2` for additional statistics exposed on >= Linux 5.9"]
    pub fn statistics(&self) -> Result<XdpStatistics, Errno> {
        XdpStatistics::new(&self.socket.fd)
    }

    /// Get the statistics of this XDP socket.
    pub fn statistics_v2(&self) -> Result<XdpStatisticsV2, Errno> {
        XdpStatisticsV2::new(&self.socket.fd)
    }

    /// Configure a default XDP program.
    ///
    /// This is necessary to start receiving packets on any of the related receive rings, i.e. to
    /// start consuming from the fill queue and fill the completion queue.
    #[doc(hidden)]
    #[deprecated = "Not implemented to reduce scope and weight, use another library to bind a BPF to the socket."]
    pub fn setup_xdp_prog(&mut self) -> Result<(), libc::c_int> {
        panic!("Not implemented to reduce scope and weight, use another library to bind a BPF to the socket.");
    }

    /// Bind the socket to a device queue, activate rx/tx queues.
    pub fn bind(&self, interface: &User) -> Result<(), Errno> {
        Umem::bind_at(interface, &self.socket.fd)
    }
}

impl User {
    /// Get the statistics of this XDP socket.
    #[deprecated = "Consider using `statistics_v2` for additional statistics exposed on >= Linux 5.9"]
    pub fn statistics(&self) -> Result<XdpStatistics, Errno> {
        XdpStatistics::new(&self.socket.fd)
    }

    /// Get the statistics of this XDP socket.
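    ///
    /// For illustration, a minimal sketch of fetching the statistics (assuming `User` is
    /// re-exported at the crate root):
    ///
    /// ```no_run
    /// # fn doc(user: &xdpilone::User) -> Result<(), xdpilone::Errno> {
    /// let stats = user.statistics_v2()?;
    /// # let _ = stats;
    /// # Ok(())
    /// # }
    /// ```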
    pub fn statistics_v2(&self) -> Result<XdpStatisticsV2, Errno> {
        XdpStatisticsV2::new(&self.socket.fd)
    }

    /// Map the RX ring into memory, returning a handle.
    ///
    /// Fails if you did not pass any size for `rx_size` in the configuration, which should be
    /// somewhat obvious.
    ///
    /// FIXME: we allow mapping the ring more than once. Not a memory safety problem afaik, but a
    /// correctness problem.
    pub fn map_rx(&self) -> Result<RingRx, Errno> {
        let rx_size = self.config.rx_size.ok_or(Errno(libc::EINVAL))?.get();
        let ring = unsafe { RingCons::rx(&self.socket.fd, &self.map, rx_size) }?;
        Ok(RingRx {
            fd: self.socket.fd.clone(),
            ring,
        })
    }

    /// Map the TX ring into memory, returning a handle.
    ///
    /// Fails if you did not pass any size for `tx_size` in the configuration, which should be
    /// somewhat obvious.
    ///
    /// FIXME: we allow mapping the ring more than once. Not a memory safety problem afaik, but a
    /// correctness problem.
    pub fn map_tx(&self) -> Result<RingTx, Errno> {
        let tx_size = self.config.tx_size.ok_or(Errno(libc::EINVAL))?.get();
        let ring = unsafe { RingProd::tx(&self.socket.fd, &self.map, tx_size) }?;
        Ok(RingTx {
            fd: self.socket.fd.clone(),
            ring,
        })
    }
}

impl SocketConfig {
    /// Flag-bit for [`Umem::bind`] that the descriptor is shared.
    ///
    /// Generally, this flag need not be passed directly. Instead, it is set by the library when
    /// the same `Umem` is used for multiple interface/queue combinations.
    pub const XDP_BIND_SHARED_UMEM: u16 = 1 << 0;
    /// Force copy-mode.
    pub const XDP_BIND_COPY: u16 = 1 << 1;
    /// Force zero-copy-mode.
    ///
    /// Check if your NIC supports zero-copy mode by searching for `XDP_SETUP_XSK_POOL` in the
    /// Linux kernel source code.
    pub const XDP_BIND_ZEROCOPY: u16 = 1 << 2;
    /// Enable support for need wakeup.
    ///
    /// Needs to be set for [`DeviceQueue::needs_wakeup`] and [`RingTx::needs_wakeup`].
    pub const XDP_BIND_NEED_WAKEUP: u16 = 1 << 3;
}

#[derive(Default)]
struct SpinLockedControlSet {
    inner: RwLock<BTreeSet<IfCtx>>,
}

impl core::ops::Deref for DeviceControl {
    type Target = dyn super::ControlSet;
    fn deref(&self) -> &Self::Target {
        &*self.inner
    }
}

impl super::ControlSet for SpinLockedControlSet {
    fn insert(&self, ctx: IfCtx) -> bool {
        let mut lock = self.inner.write();
        lock.insert(ctx)
    }

    fn contains(&self, ctx: &IfCtx) -> bool {
        let lock = self.inner.read();
        lock.contains(ctx)
    }

    fn remove(&self, ctx: &IfCtx) {
        let mut lock = self.inner.write();
        lock.remove(ctx);
    }
}

impl UmemChunk {
    /// Turn this whole chunk into a concrete descriptor for the transmit ring.
    ///
    /// If the address or offset is not as returned by the ring, then the result is unspecified,
    /// but sound. And potentially safe to use, but the kernel may complain.
    pub fn as_xdp(self) -> XdpDesc {
        let len = ptr_len(self.addr.as_ptr()) as u32;
        self.as_xdp_with_len(len)
    }

    /// Turn into a descriptor with concrete length.
    ///
    /// # Panics
    ///
    /// When debug assertions are enabled, this panics if the length is longer than the address
    /// range refers to.
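    ///
    /// For illustration, a hedged sketch; the packet length is an assumption of the example:
    ///
    /// ```no_run
    /// # fn doc(umem: &xdpilone::Umem) {
    /// use xdpilone::BufIdx;
    ///
    /// let chunk = umem.frame(BufIdx(0)).unwrap();
    /// // Announce only the bytes actually written, not the whole chunk.
    /// let desc = chunk.as_xdp_with_len(1280);
    /// # let _ = desc;
    /// # }
    /// ```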
    pub fn as_xdp_with_len(self, len: u32) -> XdpDesc {
        debug_assert!(
            len <= ptr_len(self.addr.as_ptr()) as u32,
            "Invalid XDP descriptor length {} for chunk of size {}",
            len,
            ptr_len(self.addr.as_ptr()) as u32,
        );

        XdpDesc {
            addr: self.offset,
            len,
            options: 0,
        }
    }
}
--------------------------------------------------------------------------------
/src/xsk/user.rs:
--------------------------------------------------------------------------------
use crate::xdp::XdpDesc;
use crate::xsk::{BufIdx, DeviceQueue, RingCons, RingProd, RingRx, RingTx};

impl DeviceQueue {
    /// Prepare some buffers for the fill ring.
    ///
    /// The argument is an upper bound of buffers. Use the resulting object to pass specific
    /// buffers to the fill queue and commit the write.
    pub fn fill(&mut self, max: u32) -> WriteFill<'_> {
        WriteFill {
            idx: BufIdxIter::reserve(&mut self.fcq.prod, max),
            queue: &mut self.fcq.prod,
        }
    }

    /// Reap some buffers from the completion ring.
    ///
    /// Return an iterator over completed buffers.
    ///
    /// The argument is an upper bound of buffers. Use the resulting object to dequeue specific
    /// buffers from the completion queue and commit the read.
    pub fn complete(&mut self, n: u32) -> ReadComplete<'_> {
        ReadComplete {
            idx: BufIdxIter::peek(&mut self.fcq.cons, n),
            queue: &mut self.fcq.cons,
        }
    }

    /// Return the difference between the kernel's producer state and our committed consumer head.
    pub fn available(&self) -> u32 {
        self.fcq.cons.count_pending()
    }

    /// Return the difference between our committed producer state and the kernel's consumer head.
    pub fn pending(&self) -> u32 {
        self.fcq.prod.count_pending()
    }

    /// Get the raw file descriptor of this ring.
    ///
    /// # Safety
    ///
    /// Use the file descriptor to attach the ring to an XSK map, for instance, but do not close it
    /// and avoid modifying it (unless you know what you're doing). It should be treated as a
    /// `BorrowedFd<'_>`. That said, it's not instant UB but probably delayed UB when the
    /// `DeviceQueue` modifies a reused file descriptor that it assumes to own.
    pub fn as_raw_fd(&self) -> libc::c_int {
        self.socket.fd.0
    }

    /// Query if the fill queue needs to be woken to proceed receiving.
    ///
    /// This is only accurate if `SocketConfig::XDP_BIND_NEED_WAKEUP` was set.
    pub fn needs_wakeup(&self) -> bool {
        self.fcq.prod.check_flags() & RingTx::XDP_RING_NEED_WAKEUP != 0
    }

    /// Poll the fill queue descriptor, to wake it up.
    pub fn wake(&mut self) {
        // A bit more complex than TX, here we do a full poll on the FD.
        let mut poll = libc::pollfd {
            fd: self.socket.fd.0,
            events: 0,
            revents: 0,
        };

        // FIXME: should somehow log this, right?
        let _err = unsafe { libc::poll(&mut poll as *mut _, 1, 0) };
    }
}

impl Drop for DeviceQueue {
    fn drop(&mut self) {
        self.devices.remove(&self.socket.info.ctx);
    }
}

impl RingRx {
    /// Receive some buffers.
    ///
    /// Returns an iterator over the descriptors.
    pub fn receive(&mut self, n: u32) -> ReadRx<'_> {
        ReadRx {
            idx: BufIdxIter::peek(&mut self.ring, n),
            queue: &mut self.ring,
        }
    }

    /// Query the number of available descriptors.
    ///
    /// This operation is advisory only. It performs a __relaxed__ atomic load of the kernel
    /// producer. An `acquire` barrier, such as performed by [`RingRx::receive`], is always needed
    /// before reading any of the written descriptors to ensure that these reads do not race with
    /// the kernel's writes.
    pub fn available(&self) -> u32 {
        self.ring.count_pending()
    }

    /// Get the raw file descriptor of this RX ring.
    ///
    /// # Safety
    ///
    /// Use the file descriptor to attach the ring to an XSK map, for instance, but do not close it
    /// and avoid modifying it (unless you know what you're doing). It should be treated as a
    /// `BorrowedFd<'_>`. That said, it's not instant UB but probably delayed UB when the `RingRx`
    /// modifies a reused file descriptor that it assumes to own...
    pub fn as_raw_fd(&self) -> libc::c_int {
        self.fd.0
    }
}

impl RingTx {
    const XDP_RING_NEED_WAKEUP: u32 = 1 << 0;

    /// Transmit some buffers.
    ///
    /// Returns a proxy that can be fed descriptors.
    pub fn transmit(&mut self, n: u32) -> WriteTx<'_> {
        WriteTx {
            idx: BufIdxIter::reserve(&mut self.ring, n),
            queue: &mut self.ring,
        }
    }

    /// Return the difference between our committed producer state and the kernel's consumer head.
    pub fn pending(&self) -> u32 {
        self.ring.count_pending()
    }

    /// Query if the transmit queue needs to be woken to proceed transmitting.
    ///
    /// This is only accurate if `SocketConfig::XDP_BIND_NEED_WAKEUP` was set.
    pub fn needs_wakeup(&self) -> bool {
        self.ring.check_flags() & Self::XDP_RING_NEED_WAKEUP != 0
    }

    /// Send a message (with `MSG_DONTWAIT`) to wake up the transmit queue.
    pub fn wake(&self) {
        // FIXME: should somehow log this on failure, right?
        let _ = unsafe {
            libc::sendto(
                self.fd.0,
                core::ptr::null_mut(),
                0,
                libc::MSG_DONTWAIT,
                core::ptr::null_mut(),
                0,
            )
        };
    }

    /// Get the raw file descriptor of this TX ring.
    ///
    /// # Safety
    ///
    /// Use the file descriptor to attach the ring to an XSK map, for instance, but do not close it
    /// and avoid modifying it (unless you know what you're doing). It should be treated as a
    /// `BorrowedFd<'_>`. That said, it's not instant UB but probably delayed UB when the
    /// `RingTx` modifies a reused file descriptor that it assumes to own (for instance, `wake`
    /// sends a message to it).
    pub fn as_raw_fd(&self) -> libc::c_int {
        self.fd.0
    }
}

struct BufIdxIter {
    /// The base of our operation.
    base: BufIdx,
    /// The number of peeked buffers.
    buffers: u32,
    /// The number of buffers still left.
    remain: u32,
}

/// A writer to a fill queue.
///
/// Created with [`DeviceQueue::fill`].
///
/// The owner of this value should call some of the insertion methods in any order, then release
/// the writes by [`WriteFill::commit`] which performs an atomic release in the Umem queue.
#[must_use = "Does nothing unless the writes are committed"]
pub struct WriteFill<'queue> {
    idx: BufIdxIter,
    /// The queue we write to.
    queue: &'queue mut RingProd,
}

/// A reader from a completion queue.
///
/// Created with [`DeviceQueue::complete`].
///
/// The owner of this value should call some of the reader methods or iteration in any order, then
/// mark the reads by [`ReadComplete::release`], which performs an atomic release in the Umem
/// queue.
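///
/// For illustration, a hedged sketch of draining completions; the batch size and the root
/// re-export of `DeviceQueue` are assumptions of the example:
///
/// ```no_run
/// # fn doc(device: &mut xdpilone::DeviceQueue) {
/// let mut reader = device.complete(64);
/// while let Some(addr) = reader.read() {
///     // `addr` is the offset of a chunk whose transmission completed.
///     let _ = addr;
/// }
/// reader.release();
/// # }
/// ```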
#[must_use = "Does nothing unless the reads are committed"]
pub struct ReadComplete<'queue> {
    idx: BufIdxIter,
    /// The queue we read from.
    queue: &'queue mut RingCons,
}

/// A writer to a transmission (TX) queue.
///
/// Created with [`RingTx::transmit`].
///
/// The owner of this value should call some of the insertion methods in any order, then release
/// the writes by [`WriteTx::commit`] which performs an atomic release in the Umem queue.
#[must_use = "Does nothing unless the writes are committed"]
pub struct WriteTx<'queue> {
    idx: BufIdxIter,
    /// The queue we write to.
    queue: &'queue mut RingProd,
}

/// A reader from a receive (RX) queue.
///
/// Created with [`RingRx::receive`].
///
/// The owner of this value should call some of the reader methods or iteration in any order, then
/// mark the reads by [`ReadRx::release`], which performs an atomic release in the Umem queue.
#[must_use = "Does nothing unless the reads are committed"]
pub struct ReadRx<'queue> {
    idx: BufIdxIter,
    /// The queue we read from.
    queue: &'queue mut RingCons,
}

impl Iterator for BufIdxIter {
    type Item = BufIdx;
    fn next(&mut self) -> Option<Self::Item> {
        let next = self.remain.checked_sub(1)?;
        self.remain = next;
        let ret = self.base;
        self.base.0 = self.base.0.wrapping_add(1);
        Some(ret)
    }
}

impl BufIdxIter {
    fn peek(queue: &mut RingCons, n: u32) -> Self {
        let mut this = BufIdxIter {
            buffers: 0,
            remain: 0,
            base: BufIdx(0),
        };
        this.buffers = queue.peek(1..=n, &mut this.base);
        this.remain = this.buffers;
        this
    }

    fn reserve(queue: &mut RingProd, n: u32) -> Self {
        let mut this = BufIdxIter {
            buffers: 0,
            remain: 0,
            base: BufIdx(0),
        };
        this.buffers = queue.reserve(1..=n, &mut this.base);
        this.remain = this.buffers;
        this
    }

    fn commit_prod(&mut self, queue: &mut RingProd) {
        // This contains an atomic write, which LLVM won't even try to optimize away.
        // But, as long as queues are filled there's a decent chance that we didn't manage to
        // reserve or fill a single buffer.
        //
        // FIXME: Should we expose this as a hint to the user? I.e. `commit_likely_empty` with a
        // hint. As well as better ways to avoid doing any work at all.
        if self.buffers > 0 {
            let count = self.buffers - self.remain;
            queue.submit(count);
            self.buffers -= count;
            self.base.0 += count;
        }
    }

    fn release_cons(&mut self, queue: &mut RingCons) {
        // See also `commit_prod`.
        if self.buffers > 0 {
            let count = self.buffers - self.remain;
            queue.release(count);
            self.buffers -= count;
            self.base.0 += count;
        }
    }
}

impl WriteFill<'_> {
    /// The total number of available slots.
    pub fn capacity(&self) -> u32 {
        self.idx.buffers
    }

    /// Insert one device descriptor to be filled.
    ///
    /// A descriptor is an offset in the respective Umem's memory. Any offset within a chunk can
    /// be used to mark the chunk as available for fill. The kernel will overwrite the contents
    /// arbitrarily until the chunk is returned via the RX queue.
    ///
    /// Returns whether the insert was successful, that is, `false` if the ring is full. It's
    /// guaranteed that the first [`WriteFill::capacity`] inserts with this function succeed.
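    ///
    /// For illustration, a hedged sketch of handing one chunk to the kernel; the chunk offset
    /// is an assumption of the example:
    ///
    /// ```no_run
    /// # fn doc(device: &mut xdpilone::DeviceQueue) {
    /// let mut writer = device.fill(1);
    /// if writer.insert_once(0) {
    ///     // Make the newly filled slot visible to the kernel.
    ///     writer.commit();
    /// }
    /// # }
    /// ```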
    pub fn insert_once(&mut self, nr: u64) -> bool {
        self.insert(core::iter::once(nr)) > 0
    }

    /// Fill additional slots that were reserved.
    ///
    /// The iterator is polled only for each available slot, until either is exhausted. Returns
    /// the total number of slots filled.
    pub fn insert(&mut self, it: impl Iterator<Item = u64>) -> u32 {
        let mut n = 0;
        for (item, bufidx) in it.zip(self.idx.by_ref()) {
            n += 1;
            unsafe { *self.queue.fill_addr(bufidx).as_ptr() = item };
        }
        n
    }

    /// Commit the previously written buffers to the kernel.
    pub fn commit(&mut self) {
        self.idx.commit_prod(self.queue)
    }
}

impl Drop for WriteFill<'_> {
    fn drop(&mut self) {
        // Unless everything is committed, roll back the cached queue state.
        if self.idx.buffers != 0 {
            self.queue.cancel(self.idx.buffers)
        }
    }
}

impl ReadComplete<'_> {
    /// The total number of available buffers.
    pub fn capacity(&self) -> u32 {
        self.idx.buffers
    }

    /// Read the next descriptor, an address of a chunk that was transmitted.
    pub fn read(&mut self) -> Option<u64> {
        let bufidx = self.idx.next()?;
        // Safety: the buffer is from that same queue by construction.
        Some(unsafe { *self.queue.comp_addr(bufidx).as_ptr() })
    }

    /// Release some of the read buffers back to the kernel.
    pub fn release(&mut self) {
        self.idx.release_cons(self.queue)
    }
}

impl Drop for ReadComplete<'_> {
    fn drop(&mut self) {
        // Unless everything is committed, roll back the cached queue state.
        if self.idx.buffers != 0 {
            self.queue.cancel(self.idx.buffers)
        }
    }
}

impl Iterator for ReadComplete<'_> {
    type Item = u64;

    fn next(&mut self) -> Option<u64> {
        self.read()
    }
}

impl WriteTx<'_> {
    /// The total number of available slots.
    pub fn capacity(&self) -> u32 {
        self.idx.buffers
    }

    /// Insert a chunk descriptor to be sent.
    ///
    /// Returns whether the insert was successful, that is, `false` if the ring is full. It's
    /// guaranteed that the first [`WriteTx::capacity`] inserts with this function succeed.
    pub fn insert_once(&mut self, nr: XdpDesc) -> bool {
        self.insert(core::iter::once(nr)) > 0
    }

    /// Fill the transmit ring from an iterator.
    ///
    /// Returns the total number of enqueued descriptors. This is a `u32` as it is the common
    /// integral type for describing cardinalities of descriptors in a ring. Use an inspecting
    /// iterator for a more intrusive callback.
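    ///
    /// For illustration, a hedged sketch of transmitting one frame; the frame index and length
    /// are assumptions of the example:
    ///
    /// ```no_run
    /// # fn doc(umem: &xdpilone::Umem, tx: &mut xdpilone::RingTx) {
    /// use xdpilone::BufIdx;
    ///
    /// let desc = umem.frame(BufIdx(0)).unwrap().as_xdp_with_len(1280);
    /// let mut writer = tx.transmit(1);
    /// let queued = writer.insert(core::iter::once(desc));
    /// writer.commit();
    /// drop(writer);
    /// if queued > 0 && tx.needs_wakeup() {
    ///     tx.wake();
    /// }
    /// # }
    /// ```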
    pub fn insert(&mut self, it: impl Iterator<Item = XdpDesc>) -> u32 {
        let mut n = 0;
        // FIXME: incorrect iteration order? Some items may get consumed but not inserted.
        for (item, bufidx) in it.zip(self.idx.by_ref()) {
            n += 1;
            unsafe { *self.queue.tx_desc(bufidx).as_ptr() = item };
        }
        n
    }

    /// Commit the previously written buffers to the kernel.
    pub fn commit(&mut self) {
        self.idx.commit_prod(self.queue);
    }
}

impl Drop for WriteTx<'_> {
    fn drop(&mut self) {
        // Unless everything is committed, roll back the cached queue state.
        if self.idx.buffers != 0 {
            self.queue.cancel(self.idx.buffers)
        }
    }
}

impl ReadRx<'_> {
    /// The total number of available buffers.
    pub fn capacity(&self) -> u32 {
        self.idx.buffers
    }

    /// Read one descriptor from the receive ring.
    pub fn read(&mut self) -> Option<XdpDesc> {
        let bufidx = self.idx.next()?;
        // Safety: the buffer is from that same queue by construction, by assumption this is within
        // the valid memory region of the mapping.
        // FIXME: queue could validate that this is aligned.
        Some(unsafe { *self.queue.rx_desc(bufidx).as_ptr() })
    }

    /// Release some of the read buffers back to the kernel.
    pub fn release(&mut self) {
        self.idx.release_cons(self.queue)
    }
}

impl Drop for ReadRx<'_> {
    fn drop(&mut self) {
        // Unless everything is committed, roll back the cached queue state.
        if self.idx.buffers != 0 {
            self.queue.cancel(self.idx.buffers)
        }
    }
}

impl Iterator for ReadRx<'_> {
    type Item = XdpDesc;

    fn next(&mut self) -> Option<XdpDesc> {
        self.read()
    }
}
--------------------------------------------------------------------------------