├── LICENSE
├── README.md
├── bench
│   ├── CMakeLists.txt
│   ├── read_baseline.cpp
│   ├── read_bpf.cpp
│   ├── uring_baseline.cpp
│   └── uring_bpf.cpp
├── bpf
│   ├── Makefile
│   ├── bpf_loader.c
│   ├── bpf_program.c
│   └── load_bpf.sh
└── kernel
    ├── nvme_driver_hook.diff
    └── syscall_hook.diff
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0.
This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BPF for Storage: An Exokernel-Inspired Approach

This repository contains instructions and source code for reproducing the micro-benchmarks in the HotOS'21 paper *BPF for Storage: An Exokernel-Inspired Approach*.
[[paper]](https://dl.acm.org/doi/abs/10.1145/3458336.3465290) [[talk]](https://youtu.be/E7K1aRSy7co)

## Dependencies

Operating system: Ubuntu 20.04 with a modified Linux 5.8.0 kernel

Disk: Intel Optane SSD P5800X

## Code Organization

* `kernel/syscall_hook.diff`: Linux kernel patch with the dispatch hook in the syscall layer
* `kernel/nvme_driver_hook.diff`: Linux kernel patch with the dispatch hook in the NVMe driver interrupt handler
* `bpf/load_bpf.sh`: Script to load the BPF program into the kernel
* `bpf/bpf_loader.c`: BPF program loader
* `bpf/bpf_program.c`: BPF program running memcpy
* `bpf/Makefile`: Makefile for the BPF program
* `bench/read_baseline.cpp`: Benchmark program for baseline read()
* `bench/read_bpf.cpp`: Benchmark program for read() with BPF
* `bench/uring_baseline.cpp`: Benchmark program for baseline io_uring
* `bench/uring_bpf.cpp`: Benchmark program for io_uring with BPF
* `bench/CMakeLists.txt`: CMakeLists for the benchmark programs

## Compile Kernel

There are two kernel patches (`syscall_hook.diff` and `nvme_driver_hook.diff`) that add dispatch hooks in the syscall layer and in the NVMe driver, respectively. To run experiments with different dispatch hooks, you need to compile and install a separate kernel for each patch.

First, make sure you have all the dependencies required to build a Linux kernel. You can run the following script to install them:

```bash
# enable deb-src
sudo cp /etc/apt/sources.list /etc/apt/sources.list~
sudo sed -Ei 's/^# deb-src /deb-src /' /etc/apt/sources.list
sudo apt-get update

# install build dependencies
sudo apt-get build-dep linux linux-image-$(uname -r) -y
sudo apt-get install libncurses-dev flex bison openssl libssl-dev dkms libelf-dev libudev-dev libpci-dev libiberty-dev autoconf fakeroot -y
```

Then, clone the Linux repository and check out v5.8:

```bash
git clone https://github.com/torvalds/linux.git
cd linux
git checkout tags/v5.8
```

Apply the kernel patch you need and compile the modified kernel:

```bash
git apply syscall_hook.diff # apply nvme_driver_hook.diff instead if you want to run experiments with the dispatch hook in the NVMe driver
make localmodconfig
make deb-pkg
```

After the kernel is successfully compiled, install all the `.deb` files generated in the parent folder of `linux`:

```bash
cd ..
sudo dpkg -i *.deb
```

Finally, reboot the machine and make sure that you boot into the right kernel. You can check the running kernel with `uname -r` and select a different kernel for the next boot with `grub-reboot` followed by a reboot.

## Load BPF Program

In the micro-benchmarks described in the paper, we use a simple BPF program running memcpy to simulate B-Tree page parsing.
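To make the simulation concrete, the sketch below shows the kind of node parsing the memcpy stands in for. The node layout (a 512-byte block holding a key count, 31 sorted keys, and 32 child page numbers) is our own illustrative assumption, not a structure defined by the paper or this repository:

```c
/* Hypothetical 512-byte B-Tree node: 8 + 31*8 + 32*8 = 512 bytes. */
struct btree_node {
    long num_keys;      /* number of valid keys, <= 31 */
    long keys[31];      /* sorted keys */
    long children[32];  /* on-disk page numbers of the children */
};

/* Pick the child page to fetch next; an in-kernel B-Tree lookup function
 * would return this page number so the dispatch hook can resubmit the I/O
 * without a round trip to user space. */
static long find_child(const struct btree_node *node, long key)
{
    long i;
    for (i = 0; i < node->num_keys && key >= node->keys[i]; i++)
        ;
    return node->children[i];
}
```

The benchmark's BPF program skips the comparisons and simply copies the block in `sizeof(long)` chunks (see `bpf/bpf_program.c` below), which is meant to approximate the cost of scanning the node.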
First, install the dependencies for building and loading BPF programs:

```bash
sudo apt update
sudo apt install gcc-multilib clang llvm libelf-dev libdwarf-dev -y

wget http://archive.ubuntu.com/ubuntu/pool/universe/libb/libbpf/libbpf0_0.1.0-1_amd64.deb
wget http://archive.ubuntu.com/ubuntu/pool/universe/libb/libbpf/libbpf-dev_0.1.0-1_amd64.deb
sudo dpkg -i libbpf0_0.1.0-1_amd64.deb
sudo dpkg -i libbpf-dev_0.1.0-1_amd64.deb
```

Then, run the script provided in this repository to compile and load the BPF program before running the benchmarks:

```bash
cd bpf
sudo ./load_bpf.sh
```

## Run Benchmark

First, compile the benchmark programs:

```bash
# install CMake
apt install cmake -y

# compile benchmark programs
cd bench
mkdir build
cd build
cmake ..
make
```

Before running the benchmarks, you may want to disable hyper-threading and CPU frequency scaling to avoid unstable results. To disable hyper-threading, you can run:

```bash
sudo bash -c "echo off > /sys/devices/system/cpu/smt/control" # needs to be run again after each reboot
```

To disable CPU frequency scaling on Intel CPUs, you can:

* Add `intel_pstate=passive intel_pstate=no_hwp` to your kernel parameters and then reboot
  * After the reboot, `cat /sys/devices/system/cpu/intel_pstate/status` should show `passive` instead of `active`
* For each online CPU core, set the `scaling_governor` to `performance`, and set both `scaling_max_freq` and `scaling_min_freq` to the max frequency
  * `scaling_governor`, `scaling_max_freq`, and `scaling_min_freq` for each CPU core are available in `/sys/devices/system/cpu/cpu$CPUID/cpufreq/`, where `$CPUID` is the core number
  * You can find the max frequency of a CPU core in `cpuinfo_max_freq`
* Disable all C-states except for the C0 state for each online CPU core
  * C-state knobs for each CPU core are available in `/sys/devices/system/cpu/cpu$CPUID/cpuidle`, where `$CPUID` is the core number
* Run the following script to disable global CPU frequency scaling and turbo boost:

  ```bash
  cd /sys/devices/system/cpu/intel_pstate
  sudo bash -c "echo 1 > no_turbo"
  sudo bash -c "echo 100 > max_perf_pct"
  sudo bash -c "echo 100 > min_perf_pct"
  ```

### read()

To run the B-Tree lookup simulation with the `read()` syscall, run:

```bash
# B-Tree lookup simulation with normal read() syscall
sudo ./read_baseline <num_thread> <level> <iteration> <file_name>...

# B-Tree lookup simulation with read() syscall and in-kernel dispatching
sudo ./read_bpf <num_thread> <level> <iteration> <file_name>...
```

After the benchmark finishes, it prints the latency of each simulated B-Tree lookup in nanoseconds.

To monitor the IOPS, you can run `sar -d -p 1 3600`. Note that for `./read_bpf` with the dispatch hook in the NVMe driver, the actual IOPS is the IOPS reported by `sar` times the B-Tree depth, since `sar` only captures IOPS in the Linux block layer, while the I/O request resubmission happens in the NVMe driver in this case.
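For example, with a B-Tree depth of 6, a `sar` reading of 200K transfers per second at the block layer corresponds to roughly 1.2M actual device IOPS.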
### io_uring

To run the B-Tree lookup simulation with io_uring, run:

```bash
# B-Tree lookup simulation with normal io_uring
sudo ./uring_baseline <batch_size> <level> <iteration> <file_name>...

# B-Tree lookup simulation with io_uring and in-kernel dispatching
sudo ./uring_bpf <batch_size> <level> <iteration> <file_name>...
```

After the benchmark finishes, it prints the latency of each simulated B-Tree lookup in nanoseconds.

To monitor the IOPS, you can run `sar -d -p 1 3600`. Note that for `./uring_bpf` with the dispatch hook in the NVMe driver, the actual IOPS is the IOPS reported by `sar` times the B-Tree depth, since `sar` only captures IOPS in the Linux block layer, while the I/O request resubmission happens in the NVMe driver in this case.

## Contact

For any questions or comments, please reach out to yuhong.zhong@columbia.edu.
--------------------------------------------------------------------------------
/bench/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)
project(bpf_storage)

set(CMAKE_CXX_STANDARD 14)

add_compile_options(-fpermissive)

add_executable(read_bpf read_bpf.cpp)
add_executable(read_baseline read_baseline.cpp)

add_executable(uring_bpf uring_bpf.cpp)
add_executable(uring_baseline uring_baseline.cpp)

target_link_libraries(read_bpf pthread)
target_link_libraries(read_baseline pthread)
--------------------------------------------------------------------------------
/bench/read_baseline.cpp:
--------------------------------------------------------------------------------
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <thread>
#include <chrono>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)

#define READ_SIZE_SHIFT 9
#define READ_SIZE (1 << READ_SIZE_SHIFT)

#define MAX_PAGE_INDEX (1 << 23)

using namespace std::chrono;
using namespace std;


int num_thread;
int num_file;
int level;
long iteration;

long *latency_measure;
char **file_names;


void read_key(int fd, long index, void *buffer) {
    off_t lseek_ret = lseek(fd, index << PAGE_SHIFT, SEEK_SET);
    if (lseek_ret != index << PAGE_SHIFT) {
        printf("lseek error, errno %d, ret: %ld\n", errno, lseek_ret);
        exit(1);
    }
    int read_ret = read(fd, buffer, READ_SIZE);
    if (read_ret != READ_SIZE) {
        printf("read error, errno %d, ret: %d\n", errno, read_ret);
        exit(1);
    }
}

void read_thread_fn(int thread_idx) {
    unsigned int seedp = thread_idx;
    void *buffer = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
    if (!buffer) {
        printf("cannot allocate buffer\n");
        exit(1);
    }
    memset(buffer, 0, PAGE_SIZE);

    int *fd_arr = (int *) malloc(num_file * sizeof(int));
    if (!fd_arr) {
        printf("cannot allocate fd array\n");
        exit(1);
    }
    for (int file_idx = 0; file_idx < num_file; ++file_idx) {
        fd_arr[file_idx] = open(file_names[file_idx], O_DIRECT | O_RDONLY);
        if (fd_arr[file_idx] < 0) {
            printf("cannot open file, errno: %d\n", errno);
            exit(1);
        }
    }
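    // O_DIRECT bypasses the page cache, so the buffer address, file offset,
    // and transfer size must all be aligned to the device's logical block
    // size; the page-aligned buffer and 512-byte reads above satisfy this.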
    steady_clock::time_point *start_time_arr = new steady_clock::time_point[iteration];
    if (!start_time_arr) {
        printf("cannot allocate start_time_arr\n");
        exit(1);
    }
    steady_clock::time_point *end_time_arr = new steady_clock::time_point[iteration];
    if (!end_time_arr) {
        printf("cannot allocate end_time_arr\n");
        exit(1);
    }

    for (long i = 0; i < iteration; i++) {
        start_time_arr[i] = steady_clock::now();
        for (int j = 0; j < level; j++) {
            read_key(fd_arr[rand_r(&seedp) % num_file], rand_r(&seedp) % MAX_PAGE_INDEX, buffer);
        }
        end_time_arr[i] = steady_clock::now();
    }

    for (long i = 0; i < iteration; i++) {
        auto duration = duration_cast<nanoseconds>(end_time_arr[i] - start_time_arr[i]);
        latency_measure[thread_idx * iteration + i] = duration.count();
    }
}

int main(int argc, char *argv[]) {
    if (argc < 5) {
        printf("Usage: %s <num_thread> <level> <iteration> <file_name>...\n", argv[0]);
        exit(1);
    }
    sscanf(argv[1], "%d", &num_thread);
    sscanf(argv[2], "%d", &level);
    sscanf(argv[3], "%ld", &iteration);
    num_file = argc - 4;
    file_names = argv + 4;

    latency_measure = (long *) malloc(sizeof(long) * num_thread * iteration);
    if (!latency_measure) {
        printf("cannot allocate measurements\n");
        return 1;
    }
    memset(latency_measure, 0, sizeof(long) * num_thread * iteration);

    thread *read_threads = new thread[num_thread];
    for (int i = 0; i < num_thread; i++) {
        read_threads[i] = thread(read_thread_fn, i);
    }
    for (int i = 0; i < num_thread; i++) {
        read_threads[i].join();
    }

    for (long i = 0; i < num_thread * iteration; ++i) {
        printf("%ld\n", latency_measure[i]);
    }
}
--------------------------------------------------------------------------------
/bench/read_bpf.cpp:
--------------------------------------------------------------------------------
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <thread>
#include <chrono>

#define __NR_set_bpf_level 440

#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)

#define READ_SIZE_SHIFT 9
#define READ_SIZE (1 << READ_SIZE_SHIFT)

#define MAX_PAGE_INDEX (1 << 23)

using namespace std::chrono;
using namespace std;


int num_thread;
int num_file;
int level;
long iteration;

long *latency_measure;
char **file_names;


long sys_bpf_set_level(int fd, int level) {
    return syscall(__NR_set_bpf_level, fd, level);
}

void read_key(int fd, long index, void *buffer) {
    off_t lseek_ret = lseek(fd, index << PAGE_SHIFT, SEEK_SET);
    if (lseek_ret != index << PAGE_SHIFT) {
        printf("lseek error, errno %d, ret: %ld\n", errno, lseek_ret);
        exit(1);
    }
    int read_ret = read(fd, buffer, READ_SIZE);
    if (read_ret != READ_SIZE) {
        printf("read error, errno %d, ret: %d\n", errno, read_ret);
        exit(1);
    }
}

void read_thread_fn(int thread_idx) {
    unsigned int seedp = thread_idx;
    void *buffer = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
    if (!buffer) {
        printf("cannot allocate buffer\n");
        exit(1);
    }
    memset(buffer, 0, PAGE_SIZE);

    int *fd_arr = (int *) malloc(num_file * sizeof(int));
    if (!fd_arr) {
        printf("cannot allocate fd array\n");
        exit(1);
    }
    for (int file_idx = 0; file_idx < num_file; ++file_idx) {
        fd_arr[file_idx] = open(file_names[file_idx], O_DIRECT | O_RDONLY);
        if (fd_arr[file_idx] < 0) {
            printf("cannot open file, errno: %d\n", errno);
            exit(1);
        }
        long sys_ret = sys_bpf_set_level(fd_arr[file_idx], level);
        if (sys_ret < 0) {
            printf("sys_bpf_set_level error, ret: %ld\n", sys_ret);
            exit(1);
        }
    }
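    // With the in-kernel dispatch hook armed via sys_bpf_set_level(), a single
    // read() below triggers `level` device I/Os: the kernel resubmits the next
    // lookup itself instead of returning to user space after each level.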
    steady_clock::time_point *start_time_arr = new steady_clock::time_point[iteration];
    if (!start_time_arr) {
        printf("cannot allocate start_time_arr\n");
        exit(1);
    }
    steady_clock::time_point *end_time_arr = new steady_clock::time_point[iteration];
    if (!end_time_arr) {
        printf("cannot allocate end_time_arr\n");
        exit(1);
    }

    for (long i = 0; i < iteration; i++) {
        start_time_arr[i] = steady_clock::now();
        read_key(fd_arr[rand_r(&seedp) % num_file], rand_r(&seedp) % MAX_PAGE_INDEX, buffer);
        end_time_arr[i] = steady_clock::now();
    }

    for (long i = 0; i < iteration; i++) {
        auto duration = duration_cast<nanoseconds>(end_time_arr[i] - start_time_arr[i]);
        latency_measure[thread_idx * iteration + i] = duration.count();
    }
}

int main(int argc, char *argv[]) {
    if (argc < 5) {
        printf("Usage: %s <num_thread> <level> <iteration> <file_name>...\n", argv[0]);
        exit(1);
    }
    sscanf(argv[1], "%d", &num_thread);
    sscanf(argv[2], "%d", &level);
    sscanf(argv[3], "%ld", &iteration);
    num_file = argc - 4;
    file_names = argv + 4;

    latency_measure = (long *) malloc(sizeof(long) * num_thread * iteration);
    if (!latency_measure) {
        printf("cannot allocate measurements\n");
        return 1;
    }
    memset(latency_measure, 0, sizeof(long) * num_thread * iteration);

    thread *read_threads = new thread[num_thread];
    for (int i = 0; i < num_thread; i++) {
        read_threads[i] = thread(read_thread_fn, i);
    }
    for (int i = 0; i < num_thread; i++) {
        read_threads[i].join();
    }

    for (long i = 0; i < num_thread * iteration; ++i) {
        printf("%ld\n", latency_measure[i]);
    }
}
--------------------------------------------------------------------------------
/bench/uring_baseline.cpp:
--------------------------------------------------------------------------------
/* built upon io_uring codebase https://github.com/shuveb/io_uring-by-example */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <chrono>
#include <iostream>

/* If your compilation fails because the header file below is missing,
 * your kernel is probably too old to support io_uring.
 * */
#include <linux/io_uring.h>

#define READ_SIZE 512
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)
#define MAX_PAGE_INDEX (1 << 23)

/* This is x86 specific */
#define read_barrier() __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")

using namespace std::chrono;

struct app_io_sq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *flags;
    unsigned *array;
};

struct app_io_cq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    struct io_uring_cqe *cqes;
};

struct entry {
    steady_clock::time_point start_time;
    long total_time;

    int cur_level;
};

struct submitter {
    int ring_fd;
    struct app_io_sq_ring sq_ring;
    struct io_uring_sqe *sqes;
    struct app_io_cq_ring cq_ring;

    int batch_size;
    int num_file;
    int *fd_arr;

    void *buffer;
    struct iovec *iovecs;

    struct entry *entry_arr;
    long *completion_arr;
    long finished_op;
};

/*
 * This code was written in the days when io_uring-related system calls were
 * not part of standard C libraries, so we roll our own system call wrapper
 * functions.
 * */

int io_uring_setup(unsigned entries, struct io_uring_params *p) {
    return (int) syscall(__NR_io_uring_setup, entries, p);
}

int io_uring_enter(int ring_fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags) {
    return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
                         flags, NULL, 0);
}

int io_uring_register(int fd, unsigned int opcode, void *arg,
                      unsigned int nr_args) {
    return (int) syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}

/*
 * io_uring requires a lot of setup which looks pretty hairy, but isn't all
 * that difficult to understand. Because of all this boilerplate code,
 * io_uring's author has created liburing, which is relatively easy to use.
 * However, you should take your time and understand this code. It is always
 * good to know how it all works underneath. Apart from bragging rights,
 * it does offer you a certain strange geeky peace.
 * */

int app_setup_uring(struct submitter *s) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_params p;
    void *sq_ptr, *cq_ptr;

    /*
     * We need to pass in the io_uring_params structure to the io_uring_setup()
     * call zeroed out. We could set any flags if we need to, but for this
     * example, we don't.
     * */
    memset(&p, 0, sizeof(p));
    s->ring_fd = io_uring_setup(s->batch_size, &p);
    if (s->ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    /*
     * io_uring communication happens via 2 shared kernel-user space ring buffers,
     * which can be jointly mapped with a single mmap() call in recent kernels.
     * While the completion queue is directly manipulated, the submission queue
     * has an indirection array in between. We map that in as well.
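     * (The indirection array lets the application fill SQE slots in any
     * order and publish them only by writing their indices at the ring tail.)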
     * */

    int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);

    /* In kernel version 5.4 and above, it is possible to map the submission and
     * completion buffers with a single mmap() call. Rather than check for kernel
     * versions, the recommended way is to just check the features field of the
     * io_uring_params structure, which is a bit mask. If
     * IORING_FEAT_SINGLE_MMAP is set, then we can do away with the second mmap()
     * call to map the completion ring.
     * */
    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        if (cring_sz > sring_sz) {
            sring_sz = cring_sz;
        }
        cring_sz = sring_sz;
    }

    /* Map in the submission and completion queue ring buffers.
     * Older kernels only map in the submission queue, though.
     * */
    sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_POPULATE,
                  s->ring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        /* Map in the completion queue ring buffer in older kernels separately */
        cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE,
                      s->ring_fd, IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
    }
    /* Save useful fields in a global app_io_sq_ring struct for later
     * easy reference */
    sring->head = sq_ptr + p.sq_off.head;
    sring->tail = sq_ptr + p.sq_off.tail;
    sring->ring_mask = sq_ptr + p.sq_off.ring_mask;
    sring->ring_entries = sq_ptr + p.sq_off.ring_entries;
    sring->flags = sq_ptr + p.sq_off.flags;
    sring->array = sq_ptr + p.sq_off.array;

    /* Map in the submission queue entries array */
    s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                   s->ring_fd, IORING_OFF_SQES);
    if (s->sqes == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Save useful fields in a global app_io_cq_ring struct for later
     * easy reference */
    cring->head = cq_ptr + p.cq_off.head;
    cring->tail = cq_ptr + p.cq_off.tail;
    cring->ring_mask = cq_ptr + p.cq_off.ring_mask;
    cring->ring_entries = cq_ptr + p.cq_off.ring_entries;
    cring->cqes = cq_ptr + p.cq_off.cqes;

    return 0;
}

/*
 * Read from the completion queue.
 * In this function, we read completion events from the completion queue and
 * collect the user_data of each completed request into completion_arr.
 * */
int poll_from_cq(struct submitter *s) {
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_cqe *cqe;
    unsigned head;
    int reaped = 0;

    head = *cring->head;
    read_barrier();

    while (true) {
        /*
         * Remember, this is a ring buffer. If head == tail, it means that the
         * buffer is empty.
         * */
        if (head == *cring->tail) {
            break;
        }
        read_barrier();

        /* Get the entry */
        cqe = &cring->cqes[head & *s->cq_ring.ring_mask];
        if (cqe->res != READ_SIZE) {
            printf("read_from_cq error, ret: %d\n", cqe->res);
            exit(1);
        }
        s->completion_arr[reaped++] = cqe->user_data;
        head++;
    }

    *cring->head = head;
    write_barrier();

    return reaped;
}

/*
 * Submit to the submission queue.
 * In this function, we submit requests to the submission queue. You can submit
 * many types of requests. Ours is going to be the readv() request, which we
 * specify via IORING_OP_READV.
 * */
void submit_to_sq(struct submitter *s, unsigned long long user_data, void *addr, bool with_barrier) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    unsigned index = 0, tail = 0, next_tail = 0;

    /* Add our submission queue entry to the tail of the SQE ring buffer */
    next_tail = tail = *sring->tail;
    next_tail++;
    index = tail & *s->sq_ring.ring_mask;
    struct io_uring_sqe *sqe = &s->sqes[index];
    sqe->fd = rand() % s->num_file; /* randomly choose a device */
    sqe->flags = IOSQE_FIXED_FILE;
    sqe->opcode = IORING_OP_READV;
    sqe->addr = (unsigned long) addr;
    sqe->len = 1;
    sqe->off = ((long) (rand() % MAX_PAGE_INDEX)) << PAGE_SHIFT; /* randomly choose an offset */
    sqe->user_data = user_data;
    sring->array[index] = index;
    tail = next_tail;

    if (with_barrier) {
        write_barrier();
    }

    /* Update the tail so the kernel can see it. */
    if (*sring->tail != tail) {
        *sring->tail = tail;
        if (with_barrier) {
            write_barrier();
        }
    }
}

int main(int argc, char *argv[]) {
    struct submitter *s;

    if (argc < 5) {
        fprintf(stderr, "Usage: %s <batch_size> <level> <iteration> <file_name>...\n", argv[0]);
        return 1;
    }
    int batch_size;
    int level;
    long iteration;
    int num_file = argc - 4;
    sscanf(argv[1], "%d", &batch_size);
    sscanf(argv[2], "%d", &level);
    sscanf(argv[3], "%ld", &iteration);

    s = (struct submitter *) malloc(sizeof(*s));
    if (!s) {
        perror("malloc");
        return 1;
    }
    memset(s, 0, sizeof(*s));
    s->batch_size = batch_size;
    s->num_file = num_file;

    if (app_setup_uring(s)) {
        fprintf(stderr, "Unable to setup uring!\n");
        return 1;
    }

    s->fd_arr = (int *) malloc(sizeof(int) * num_file);
    if (!s->fd_arr) {
        perror("s->fd_arr");
        return 1;
    }

    for (int i = 0; i < num_file; ++i) {
        s->fd_arr[i] = open(argv[4 + i], O_RDONLY | O_DIRECT);
        if (s->fd_arr[i] < 0) {
            perror("open");
            exit(1);
        }
    }
    int ret = io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fd_arr, s->num_file);
    if (ret) {
        perror("io_uring_register");
        exit(1);
    }

    if (posix_memalign(&s->buffer, READ_SIZE, READ_SIZE)) {
        perror("posix_memalign");
        return 1;
    }
    s->iovecs = (struct iovec *) malloc(batch_size * sizeof(struct iovec));
    if (!s->iovecs) {
        perror("s->iovecs");
        exit(1);
    }
    for (int i = 0; i < batch_size; ++i) {
        s->iovecs[i].iov_base = s->buffer;
        s->iovecs[i].iov_len = READ_SIZE;
    }

    s->entry_arr = (struct entry *) malloc(iteration * batch_size * sizeof(*s->entry_arr));
    if (!s->entry_arr) {
        perror("s->entry_arr");
        exit(1);
    }
    memset(s->entry_arr, 0, iteration * batch_size * sizeof(*s->entry_arr));
    s->completion_arr = (long *) malloc(batch_size * sizeof(long));
    if (!s->completion_arr) {
        perror("s->completion_arr");
        exit(1);
    }
    memset(s->completion_arr, 0, batch_size * sizeof(long));
    s->finished_op = 0;
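    /* Pipeline structure: the loop below primes batch_size requests; from then
     * on, every reaped completion immediately submits the next read for that
     * slot, so batch_size requests stay in flight and each entry accumulates
     * `level` completions before its latency is final. */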
    /* send the first batch */
    for (int i = 0; i < batch_size; ++i) {
        s->entry_arr[i].start_time = steady_clock::now();
        submit_to_sq(s, i, &s->iovecs[i], false);
    }
    write_barrier();
    int enter_ret = io_uring_enter(s->ring_fd, s->batch_size, 0, IORING_ENTER_GETEVENTS);
    if (enter_ret < 0) {
        perror("io_uring_enter");
        return 1;
    }

    while (s->finished_op < iteration * batch_size * level) {
        int reaped = poll_from_cq(s);
        int submitted = 0;
        for (int i = 0; i < reaped; ++i) {
            long index = s->completion_arr[i];
            long batch_index = index / batch_size;
            long sub_index = index % batch_size;
            struct entry *e = &s->entry_arr[index];
            e->total_time += duration_cast<nanoseconds>(steady_clock::now() - e->start_time).count();
            ++e->cur_level;

            if (e->cur_level == level && batch_index == iteration - 1) {
                /* no further request */
                continue;
            }
            long next_batch_index = (e->cur_level == level) ? batch_index + 1 : batch_index;
            long next_index = next_batch_index * batch_size + sub_index;
            struct entry *next_e = &s->entry_arr[next_index];
            next_e->start_time = steady_clock::now();
            submit_to_sq(s, next_index, &s->iovecs[sub_index], false);
            ++submitted;
        }
        s->finished_op += reaped;
        write_barrier();
        if (submitted == 0) {
            continue;
        }
        int enter_ret = io_uring_enter(s->ring_fd, submitted, 0, IORING_ENTER_GETEVENTS);
        if (enter_ret < 0) {
            perror("io_uring_enter");
            return 1;
        }
    }

    for (long i = 0; i < iteration * batch_size; ++i) {
        std::cout << s->entry_arr[i].total_time << std::endl;
    }

    return 0;
}
--------------------------------------------------------------------------------
/bench/uring_bpf.cpp:
--------------------------------------------------------------------------------
/* built upon io_uring codebase https://github.com/shuveb/io_uring-by-example */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <chrono>
#include <iostream>

/* If your compilation fails because the header file below is missing,
 * your kernel is probably too old to support io_uring.
 * */
#include <linux/io_uring.h>

#define READ_SIZE 512
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)
#define MAX_PAGE_INDEX (1 << 23)

#define __NR_set_bpf_level 440

/* This is x86 specific */
#define read_barrier() __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")

using namespace std::chrono;

struct app_io_sq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *flags;
    unsigned *array;
};

struct app_io_cq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    struct io_uring_cqe *cqes;
};

struct entry {
    steady_clock::time_point start_time;
    long total_time;
};

struct submitter {
    int ring_fd;
    struct app_io_sq_ring sq_ring;
    struct io_uring_sqe *sqes;
    struct app_io_cq_ring cq_ring;

    int batch_size;
    int num_file;
    int *fd_arr;

    void *buffer;
    struct iovec *iovecs;

    struct entry *entry_arr;
    long *completion_arr;
    long finished_op;
};

long sys_set_bpf_level(int fd, int level) {
    return syscall(__NR_set_bpf_level, fd, level);
}

/*
 * This code was written in the days when io_uring-related system calls were
 * not part of standard C libraries, so we roll our own system call wrapper
 * functions.
 * */

int io_uring_setup(unsigned entries, struct io_uring_params *p) {
    return (int) syscall(__NR_io_uring_setup, entries, p);
}

int io_uring_enter(int ring_fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags) {
    return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
                         flags, NULL, 0);
}

int io_uring_register(int fd, unsigned int opcode, void *arg,
                      unsigned int nr_args) {
    return (int) syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}

/*
 * io_uring requires a lot of setup which looks pretty hairy, but isn't all
 * that difficult to understand. Because of all this boilerplate code,
 * io_uring's author has created liburing, which is relatively easy to use.
 * However, you should take your time and understand this code. It is always
 * good to know how it all works underneath. Apart from bragging rights,
 * it does offer you a certain strange geeky peace.
 * */

int app_setup_uring(struct submitter *s) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_params p;
    void *sq_ptr, *cq_ptr;

    /*
     * We need to pass in the io_uring_params structure to the io_uring_setup()
     * call zeroed out. We could set any flags if we need to, but for this
     * example, we don't.
     * */
    memset(&p, 0, sizeof(p));
    s->ring_fd = io_uring_setup(s->batch_size, &p);
    if (s->ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    /*
     * io_uring communication happens via 2 shared kernel-user space ring buffers,
     * which can be jointly mapped with a single mmap() call in recent kernels.
     * While the completion queue is directly manipulated, the submission queue
     * has an indirection array in between. We map that in as well.
     * */

    int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);

    /* In kernel version 5.4 and above, it is possible to map the submission and
     * completion buffers with a single mmap() call. Rather than check for kernel
     * versions, the recommended way is to just check the features field of the
     * io_uring_params structure, which is a bit mask. If
     * IORING_FEAT_SINGLE_MMAP is set, then we can do away with the second mmap()
     * call to map the completion ring.
     * */
    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        if (cring_sz > sring_sz) {
            sring_sz = cring_sz;
        }
        cring_sz = sring_sz;
    }

    /* Map in the submission and completion queue ring buffers.
     * Older kernels only map in the submission queue, though.
     * */
    sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_POPULATE,
                  s->ring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        /* Map in the completion queue ring buffer in older kernels separately */
        cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE,
                      s->ring_fd, IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
    }
    /* Save useful fields in a global app_io_sq_ring struct for later
     * easy reference */
    sring->head = sq_ptr + p.sq_off.head;
    sring->tail = sq_ptr + p.sq_off.tail;
    sring->ring_mask = sq_ptr + p.sq_off.ring_mask;
    sring->ring_entries = sq_ptr + p.sq_off.ring_entries;
    sring->flags = sq_ptr + p.sq_off.flags;
    sring->array = sq_ptr + p.sq_off.array;

    /* Map in the submission queue entries array */
    s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                   s->ring_fd, IORING_OFF_SQES);
    if (s->sqes == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Save useful fields in a global app_io_cq_ring struct for later
     * easy reference */
    cring->head = cq_ptr + p.cq_off.head;
    cring->tail = cq_ptr + p.cq_off.tail;
    cring->ring_mask = cq_ptr + p.cq_off.ring_mask;
    cring->ring_entries = cq_ptr + p.cq_off.ring_entries;
    cring->cqes = cq_ptr + p.cq_off.cqes;

    return 0;
}

/*
 * Read from the completion queue.
 * In this function, we read completion events from the completion queue and
 * collect the user_data of each completed request into completion_arr.
 * */
int poll_from_cq(struct submitter *s) {
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_cqe *cqe;
    unsigned head;
    int reaped = 0;

    head = *cring->head;
    read_barrier();

    while (true) {
        /*
         * Remember, this is a ring buffer. If head == tail, it means that the
         * buffer is empty.
         * */
        if (head == *cring->tail) {
            break;
        }
        read_barrier();

        /* Get the entry */
        cqe = &cring->cqes[head & *s->cq_ring.ring_mask];
        if (cqe->res != READ_SIZE) {
            printf("read_from_cq error, ret: %d\n", cqe->res);
            exit(1);
        }
        s->completion_arr[reaped++] = cqe->user_data;
        head++;
    }

    *cring->head = head;
    write_barrier();

    return reaped;
}

/*
 * Submit to the submission queue.
 * In this function, we submit requests to the submission queue. You can submit
 * many types of requests. Ours is going to be the readv() request, which we
 * specify via IORING_OP_READV.
 * */
void submit_to_sq(struct submitter *s, unsigned long long user_data, void *addr, bool with_barrier) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    unsigned index = 0, tail = 0, next_tail = 0;

    /* Add our submission queue entry to the tail of the SQE ring buffer */
    next_tail = tail = *sring->tail;
    next_tail++;
    index = tail & *s->sq_ring.ring_mask;
    struct io_uring_sqe *sqe = &s->sqes[index];
    sqe->fd = rand() % s->num_file; /* randomly choose a device */
    sqe->flags = IOSQE_FIXED_FILE;
    sqe->opcode = IORING_OP_READV;
    sqe->addr = (unsigned long) addr;
    sqe->len = 1;
    sqe->off = ((long) (rand() % MAX_PAGE_INDEX)) << PAGE_SHIFT; /* randomly choose an offset */
    sqe->user_data = user_data;
    sring->array[index] = index;
    tail = next_tail;

    if (with_barrier) {
        write_barrier();
    }

    /* Update the tail so the kernel can see it. */
    if (*sring->tail != tail) {
        *sring->tail = tail;
        if (with_barrier) {
            write_barrier();
        }
    }
}

int main(int argc, char *argv[]) {
    struct submitter *s;

    if (argc < 5) {
        fprintf(stderr, "Usage: %s <batch_size> <level> <iteration> <file_name>...\n", argv[0]);
        return 1;
    }
    int batch_size;
    int level;
    long iteration;
    int num_file = argc - 4;
    sscanf(argv[1], "%d", &batch_size);
    sscanf(argv[2], "%d", &level);
    sscanf(argv[3], "%ld", &iteration);

    s = (struct submitter *) malloc(sizeof(*s));
    if (!s) {
        perror("malloc");
        return 1;
    }
    memset(s, 0, sizeof(*s));
    s->batch_size = batch_size;
    s->num_file = num_file;

    if (app_setup_uring(s)) {
        fprintf(stderr, "Unable to setup uring!\n");
        return 1;
    }

    s->fd_arr = (int *) malloc(sizeof(int) * num_file);
    if (!s->fd_arr) {
        perror("s->fd_arr");
        return 1;
    }

    for (int i = 0; i < num_file; ++i) {
        s->fd_arr[i] = open(argv[4 + i], O_RDONLY | O_DIRECT);
        if (s->fd_arr[i] < 0) {
            perror("open");
            exit(1);
        }
        long sys_ret = sys_set_bpf_level(s->fd_arr[i], level);
        if (sys_ret < 0) {
            printf("sys_set_bpf_level error, ret: %ld\n", sys_ret);
            exit(1);
        }
    }
    int ret = io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fd_arr, s->num_file);
    if (ret) {
        perror("io_uring_register");
        exit(1);
    }

    if (posix_memalign(&s->buffer, READ_SIZE, READ_SIZE)) {
        perror("posix_memalign");
        return 1;
    }
    s->iovecs = (struct iovec *) malloc(batch_size * sizeof(struct iovec));
    if (!s->iovecs) {
        perror("s->iovecs");
        exit(1);
    }
    for (int i = 0; i < batch_size; ++i) {
        s->iovecs[i].iov_base = s->buffer;
        s->iovecs[i].iov_len = READ_SIZE;
    }

    s->entry_arr = (struct entry *) malloc(iteration * batch_size * sizeof(*s->entry_arr));
    if (!s->entry_arr) {
        perror("s->entry_arr");
        exit(1);
    }
    memset(s->entry_arr, 0, iteration * batch_size * sizeof(*s->entry_arr));
    s->completion_arr = (long *) malloc(batch_size * sizeof(long));
    if (!s->completion_arr) {
        perror("s->completion_arr");
        exit(1);
    }
    memset(s->completion_arr, 0, batch_size * sizeof(long));
    s->finished_op = 0;

    /* send the first batch */
    for (int i = 0; i < batch_size; ++i) {
        s->entry_arr[i].start_time = steady_clock::now();
        submit_to_sq(s, i, &s->iovecs[i], false);
    }
    write_barrier();
    int enter_ret = io_uring_enter(s->ring_fd, s->batch_size, 0, IORING_ENTER_GETEVENTS);
    if (enter_ret < 0) {
        perror("io_uring_enter");
        return 1;
    }
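    /* Unlike uring_baseline, each completion reaped below already covers a full
     * level-deep traversal: the in-kernel dispatch hook resubmits the reads for
     * intermediate levels, so user space never sees them. */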
346 | s->entry_arr = (struct entry *) malloc(iteration * batch_size * sizeof(*s->entry_arr)); 347 | if (!s->entry_arr) { 348 | perror("s->entry_arr"); 349 | exit(1); 350 | } 351 | memset(s->entry_arr, 0, iteration * batch_size * sizeof(*s->entry_arr)); 352 | s->completion_arr = (long *) malloc(batch_size * sizeof(long)); 353 | if (!s->completion_arr) { 354 | perror("s->completion_arr"); 355 | exit(1); 356 | } 357 | memset(s->completion_arr, 0, batch_size * sizeof(long)); 358 | s->finished_op = 0; 359 | 360 | /* send the first batch */ 361 | for (int i = 0; i < batch_size; ++i) { 362 | s->entry_arr[i].start_time = steady_clock::now(); 363 | submit_to_sq(s, i, &s->iovecs[i], false); 364 | } 365 | write_barrier(); 366 | int enter_ret = io_uring_enter(s->ring_fd, s->batch_size, 0, IORING_ENTER_GETEVENTS); 367 | if (enter_ret < 0) { 368 | perror("io_uring_enter"); 369 | return 1; 370 | } 371 | 372 | while (s->finished_op < iteration * batch_size) { 373 | int reaped = poll_from_cq(s); 374 | int submitted = 0; 375 | for (int i = 0; i < reaped; ++i) { 376 | long index = s->completion_arr[i]; 377 | long batch_index = index / batch_size; 378 | long sub_index = index % batch_size; 379 | struct entry *e = &s->entry_arr[index]; 380 | e->total_time += duration_cast<nanoseconds>(steady_clock::now() - e->start_time).count(); 381 | 382 | if (batch_index == iteration - 1) { 383 | /* no further request */ 384 | continue; 385 | } 386 | long next_batch_index = batch_index + 1; 387 | long next_index = next_batch_index * batch_size + sub_index; 388 | struct entry *next_e = &s->entry_arr[next_index]; 389 | next_e->start_time = steady_clock::now(); 390 | submit_to_sq(s, next_index, &s->iovecs[sub_index], false); 391 | ++submitted; 392 | } 393 | s->finished_op += reaped; 394 | write_barrier(); 395 | if (submitted == 0) { 396 | continue; 397 | } 398 | int enter_ret = io_uring_enter(s->ring_fd, submitted, 0, IORING_ENTER_GETEVENTS); 399 | if (enter_ret < 0) { 400 | perror("io_uring_enter"); 401 | return 1; 402 | } 403 | } 404 | 405 | for (long i = 0; i < iteration * batch_size; ++i) { 406 | std::cout << s->entry_arr[i].total_time << std::endl; 407 | } 408 | 409 | return 0; 410 | } 411 | -------------------------------------------------------------------------------- /bpf/Makefile: -------------------------------------------------------------------------------- 1 | LLC ?= llc 2 | CLANG ?= clang 3 | CC ?= gcc 4 | 5 | BPF_CFLAGS ?= -I$(LIBBPF_DIR)/build/usr/ 6 | 7 | all: bpf_program.o 8 | 9 | %.o: %.c 10 | $(CLANG) -S \ 11 | -target bpf \ 12 | -D __BPF_TRACING__ \ 13 | $(BPF_CFLAGS) \ 14 | -Wall \ 15 | -Wno-unused-value \ 16 | -Wno-pointer-sign \ 17 | -Wno-compare-distinct-pointer-types \ 18 | -Werror \ 19 | -O2 -emit-llvm -c -g -o ${@:.o=.ll} $< 20 | $(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll} 21 | 22 | clean: 23 | rm -f *.o *.ll 24 | -------------------------------------------------------------------------------- /bpf/bpf_loader.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | int main(int argc, char **argv) 20 | { 21 | struct bpf_object *obj; 22 | int ret, progfd; 23 | 24 | ret = bpf_prog_load("bpf_program.o", BPF_PROG_TYPE_STORAGE, &obj, &progfd); 25 | if (ret) { 26 | printf("Failed to load bpf program\n"); 27 | exit(1); 28 | } 29 | 30 | ret = bpf_prog_attach(/*prog_fd=*/progfd, /*target_fd=*/0,
BPF_STORAGE, 0); 31 | if (ret) { 32 | printf("Failed to attach bpf\n"); 33 | exit(1); 34 | } 35 | 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /bpf/bpf_program.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | char _license[] SEC("license") = "GPL"; 6 | 7 | #define __inline inline __attribute__((always_inline)) 8 | #define __noinline __attribute__((noinline)) 9 | #define __nooptimize __attribute__((optnone)) 10 | 11 | #define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) 12 | #define memset(dest, value, n) __builtin_memset((dest), (value), (n)) 13 | 14 | SEC("prog") 15 | __u32 main_func(struct bpf_storage *context) { 16 | long number = 0; 17 | int i = 0; 18 | #pragma unroll 19 | for (i = 0; i < (512 / sizeof(long)); ++i) { 20 | memcpy(&number, &context->data[i * sizeof(long)], sizeof(long)); 21 | } 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /bpf/load_bpf.sh: -------------------------------------------------------------------------------- 1 | make 2 | clang bpf_loader.c -lbpf 3 | ./a.out 4 | -------------------------------------------------------------------------------- /kernel/nvme_driver_hook.diff: -------------------------------------------------------------------------------- 1 | diff --git a/Makefile b/Makefile 2 | index 24a4c1b97bb0..d702769637fd 100644 3 | --- a/Makefile 4 | +++ b/Makefile 5 | @@ -2,7 +2,7 @@ 6 | VERSION = 5 7 | PATCHLEVEL = 8 8 | SUBLEVEL = 0 9 | -EXTRAVERSION = 10 | +EXTRAVERSION =-bpf-storage-nvme 11 | NAME = Kleptomaniac Octopus 12 | 13 | # *DOCUMENTATION* 14 | diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl 15 | index 78847b32e137..f9a4b4b7bdda 100644 16 | --- a/arch/x86/entry/syscalls/syscall_64.tbl 17 | +++ b/arch/x86/entry/syscalls/syscall_64.tbl 18 | @@ -360,6 +360,7 @@ 19 | 437 common openat2 sys_openat2 20 | 438 common pidfd_getfd sys_pidfd_getfd 21 | 439 common faccessat2 sys_faccessat2 22 | +440 common set_bpf_level sys_set_bpf_level 23 | 24 | # 25 | # x32-specific system call numbers start at 512 to avoid cache impact 26 | diff --git a/block/blk-core.c b/block/blk-core.c 27 | index 03252af8c82c..41345936edcf 100644 28 | --- a/block/blk-core.c 29 | +++ b/block/blk-core.c 30 | @@ -907,6 +907,9 @@ static inline int blk_partition_remap(struct bio *bio) 31 | if (bio_check_eod(bio, part_nr_sects_read(p))) 32 | goto out; 33 | bio->bi_iter.bi_sector += p->start_sect; 34 | + if (bio->_bpf_level > 0) { 35 | + bio->_bpf_partition_start_sector = p->start_sect; 36 | + } 37 | trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), 38 | bio->bi_iter.bi_sector - p->start_sect); 39 | } 40 | diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h 41 | index 09ffc3246f60..64d076067e3e 100644 42 | --- a/drivers/nvme/host/nvme.h 43 | +++ b/drivers/nvme/host/nvme.h 44 | @@ -485,6 +485,10 @@ static inline void nvme_end_request(struct request *req, __le16 status, 45 | union nvme_result result) 46 | { 47 | struct nvme_request *rq = nvme_req(req); 48 | + if (req->_bpf_command) { 49 | + kfree(req->_bpf_command); 50 | + req->_bpf_command = NULL; 51 | + } 52 | 53 | rq->status = le16_to_cpu(status) >> 1; 54 | rq->result = result; 55 | diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c 56 | index d4b1ff747123..ed1dbb010552 100644 57 | --- a/drivers/nvme/host/pci.c 58 | +++ 
b/drivers/nvme/host/pci.c 59 | @@ -24,6 +24,8 @@ 60 | #include 61 | #include 62 | #include 63 | +#include 64 | +#include 65 | 66 | #include "trace.h" 67 | #include "nvme.h" 68 | @@ -469,14 +471,15 @@ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq) 69 | static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, 70 | bool write_sq) 71 | { 72 | - spin_lock(&nvmeq->sq_lock); 73 | + unsigned long flags; 74 | + spin_lock_irqsave(&nvmeq->sq_lock, flags); 75 | memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), 76 | cmd, sizeof(*cmd)); 77 | if (++nvmeq->sq_tail == nvmeq->q_depth) 78 | nvmeq->sq_tail = 0; 79 | if (write_sq) 80 | nvme_write_sq_db(nvmeq); 81 | - spin_unlock(&nvmeq->sq_lock); 82 | + spin_unlock_irqrestore(&nvmeq->sq_lock, flags); 83 | } 84 | 85 | static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) 86 | @@ -860,9 +863,23 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, 87 | struct nvme_dev *dev = nvmeq->dev; 88 | struct request *req = bd->rq; 89 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 90 | - struct nvme_command cmnd; 91 | + struct nvme_command cmnd, *cmndp; 92 | blk_status_t ret; 93 | 94 | + if (req->bio && req->bio->_bpf_level > 0) { 95 | + cmndp = kmalloc(sizeof(struct nvme_command), GFP_NOWAIT); 96 | + if (!cmndp) { 97 | + printk("nvme_queue_rq: failed to allocate struct nvme_command\n"); 98 | + cmndp = &cmnd; 99 | + req->_bpf_command = NULL; 100 | + } else { 101 | + req->_bpf_command = cmndp; 102 | + } 103 | + } else { 104 | + cmndp = &cmnd; 105 | + req->_bpf_command = NULL; 106 | + } 107 | + 108 | iod->aborted = 0; 109 | iod->npages = -1; 110 | iod->nents = 0; 111 | @@ -874,24 +891,24 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, 112 | if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) 113 | return BLK_STS_IOERR; 114 | 115 | - ret = nvme_setup_cmd(ns, req, &cmnd); 116 | + ret = nvme_setup_cmd(ns, req, cmndp); 117 | if (ret) 118 | return ret; 119 | 120 | if (blk_rq_nr_phys_segments(req)) { 121 | - ret = nvme_map_data(dev, req, &cmnd); 122 | + ret = nvme_map_data(dev, req, cmndp); 123 | if (ret) 124 | goto out_free_cmd; 125 | } 126 | 127 | if (blk_integrity_rq(req)) { 128 | - ret = nvme_map_metadata(dev, req, &cmnd); 129 | + ret = nvme_map_metadata(dev, req, cmndp); 130 | if (ret) 131 | goto out_unmap_data; 132 | } 133 | 134 | blk_mq_start_request(req); 135 | - nvme_submit_cmd(nvmeq, &cmnd, bd->last); 136 | + nvme_submit_cmd(nvmeq, cmndp, bd->last); 137 | return BLK_STS_OK; 138 | out_unmap_data: 139 | nvme_unmap_data(dev, req); 140 | @@ -937,10 +954,16 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) 141 | return nvmeq->dev->tagset.tags[nvmeq->qid - 1]; 142 | } 143 | 144 | +extern struct bpf_prog __rcu *_bpf_prog; 145 | +extern struct bpf_storage_kern _bpf_g_context; 146 | + 147 | static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) 148 | { 149 | struct nvme_completion *cqe = &nvmeq->cqes[idx]; 150 | struct request *req; 151 | + long _index; 152 | + struct bpf_prog *_local_bpf_prog; 153 | + u32 _bpf_return; 154 | 155 | if (unlikely(cqe->command_id >= nvmeq->q_depth)) { 156 | dev_warn(nvmeq->dev->ctrl.device, 157 | @@ -963,7 +986,31 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) 158 | 159 | req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); 160 | trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); 161 | - nvme_end_request(req, cqe->status, cqe->result); 162 | + 163 | + if (!req->bio || req->bio->_bpf_level 
== 0) { 164 | + nvme_end_request(req, cqe->status, cqe->result); 165 | + } else { 166 | + ++req->bio->_bpf_count; 167 | + if (req->bio->_bpf_count < req->bio->_bpf_level) { 168 | + /* resubmit another request */ 169 | + rcu_read_lock(); 170 | + _local_bpf_prog = rcu_dereference(_bpf_prog); 171 | + if (_local_bpf_prog) { 172 | + /* run bpf program if present */ 173 | + _bpf_return = BPF_PROG_RUN(_local_bpf_prog, &_bpf_g_context); 174 | + } 175 | + rcu_read_unlock(); 176 | + _index = req->bio->bi_iter.bi_sector >> (12 - 9); 177 | + _index = (_index * 1103515245 + 12345) % (1 << 23); /* randomly choose the next offset */ 178 | + req->bio->bi_iter.bi_sector = _index << (12 - 9); 179 | + req->__sector = req->bio->bi_iter.bi_sector + req->bio->_bpf_partition_start_sector; 180 | + req->_bpf_command->rw.slba = cpu_to_le64(nvme_sect_to_lba(req->q->queuedata, blk_rq_pos(req))); 181 | + nvme_submit_cmd(nvmeq, req->_bpf_command, true); 182 | + } else { 183 | + /* complete this IO chain */ 184 | + nvme_end_request(req, cqe->status, cqe->result); 185 | + } 186 | + } 187 | } 188 | 189 | static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) 190 | diff --git a/fs/block_dev.c b/fs/block_dev.c 191 | index 0ae656e022fd..98ba2b352d81 100644 192 | --- a/fs/block_dev.c 193 | +++ b/fs/block_dev.c 194 | @@ -231,6 +231,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, 195 | bio.bi_end_io = blkdev_bio_end_io_simple; 196 | bio.bi_ioprio = iocb->ki_ioprio; 197 | 198 | + bio._bpf_level = file->_bpf_level; 199 | + bio._bpf_partition_start_sector = 0; 200 | + 201 | ret = bio_iov_iter_get_pages(&bio, iter); 202 | if (unlikely(ret)) 203 | goto out; 204 | @@ -381,6 +384,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) 205 | bio->bi_private = dio; 206 | bio->bi_end_io = blkdev_bio_end_io; 207 | bio->bi_ioprio = iocb->ki_ioprio; 208 | + bio->_bpf_level = file->_bpf_level; 209 | + bio->_bpf_partition_start_sector = 0; 210 | 211 | ret = bio_iov_iter_get_pages(bio, iter); 212 | if (unlikely(ret)) { 213 | diff --git a/fs/ioctl.c b/fs/ioctl.c 214 | index d69786d1dd91..9ccd5201331d 100644 215 | --- a/fs/ioctl.c 216 | +++ b/fs/ioctl.c 217 | @@ -19,6 +19,8 @@ 218 | #include 219 | #include 220 | #include 221 | +#include 222 | +#include 223 | 224 | #include "internal.h" 225 | 226 | @@ -762,6 +764,57 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 227 | return ksys_ioctl(fd, cmd, arg); 228 | } 229 | 230 | +SYSCALL_DEFINE2(set_bpf_level, int, fd, int, level) 231 | +{ 232 | + struct fd f = fdget_pos(fd); 233 | + long ret = -EBADF; 234 | + 235 | + if (f.file) { 236 | + f.file->_bpf_level = level; 237 | + fdput_pos(f); 238 | + ret = 0; 239 | + } else { 240 | + printk("set_bpf_level: bad file descriptor\n"); 241 | + } 242 | + 243 | + return ret; 244 | +} 245 | + 246 | +struct bpf_prog __rcu *_bpf_prog; 247 | +EXPORT_SYMBOL(_bpf_prog); 248 | + 249 | +struct bpf_storage_kern _bpf_g_context; 250 | +EXPORT_SYMBOL(_bpf_g_context); 251 | + 252 | +int _storage_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) 253 | +{ 254 | + rcu_assign_pointer(_bpf_prog, prog); 255 | + return 0; 256 | +} 257 | + 258 | +int _storage_bpf_prog_detach(const union bpf_attr *attr) 259 | +{ 260 | + rcu_assign_pointer(_bpf_prog, NULL); 261 | + return 0; 262 | +} 263 | + 264 | +const struct bpf_prog_ops storage_prog_ops = {}; 265 | + 266 | +static const struct bpf_func_proto * 267 | +storage_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 
268 | +{ 269 | + return bpf_base_func_proto(func_id); 270 | +} 271 | + 272 | +static bool storage_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info){ 273 | + return true; 274 | +} 275 | + 276 | +const struct bpf_verifier_ops storage_verifier_ops = { 277 | + .get_func_proto = storage_func_proto, 278 | + .is_valid_access = storage_is_valid_access, 279 | +}; 280 | + 281 | #ifdef CONFIG_COMPAT 282 | /** 283 | * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation 284 | diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c 285 | index ec7b78e6feca..a6e53c8b311d 100644 286 | --- a/fs/iomap/direct-io.c 287 | +++ b/fs/iomap/direct-io.c 288 | @@ -277,6 +277,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, 289 | bio->bi_private = dio; 290 | bio->bi_end_io = iomap_dio_bio_end_io; 291 | 292 | + bio->_bpf_level = dio->iocb->ki_filp->_bpf_level; 293 | + bio->_bpf_partition_start_sector = 0; 294 | + 295 | ret = bio_iov_iter_get_pages(bio, dio->submit.iter); 296 | if (unlikely(ret)) { 297 | /* 298 | diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h 299 | index ccb895f911b1..013d8aa2396e 100644 300 | --- a/include/linux/blk_types.h 301 | +++ b/include/linux/blk_types.h 302 | @@ -211,6 +211,10 @@ struct bio { 303 | 304 | struct bio_set *bi_pool; 305 | 306 | + int _bpf_level; 307 | + int _bpf_count; 308 | + u64 _bpf_partition_start_sector; 309 | + 310 | /* 311 | * We can inline a number of vecs at the end of the bio, to avoid 312 | * double allocations for a small number of bio_vecs. This member 313 | diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h 314 | index 57241417ff2f..c59a532c0376 100644 315 | --- a/include/linux/blkdev.h 316 | +++ b/include/linux/blkdev.h 317 | @@ -27,6 +27,7 @@ 318 | #include 319 | #include 320 | #include 321 | +#include 322 | 323 | struct module; 324 | struct scsi_ioctl_command; 325 | @@ -241,6 +242,8 @@ struct request { 326 | u64 fifo_time; 327 | }; 328 | 329 | + struct nvme_command *_bpf_command; 330 | + 331 | /* 332 | * completion callback. 333 | */ 334 | diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h 335 | index a18ae82a298a..53e695703b57 100644 336 | --- a/include/linux/bpf_types.h 337 | +++ b/include/linux/bpf_types.h 338 | @@ -76,6 +76,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, 339 | #endif /* CONFIG_BPF_LSM */ 340 | #endif 341 | 342 | +BPF_PROG_TYPE(BPF_PROG_TYPE_STORAGE, storage, 343 | + struct bpf_storage, struct bpf_storage_kern) 344 | + 345 | BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) 346 | BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) 347 | BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) 348 | diff --git a/include/linux/filter.h b/include/linux/filter.h 349 | index 0b0144752d78..cb81a2458192 100644 350 | --- a/include/linux/filter.h 351 | +++ b/include/linux/filter.h 352 | @@ -1278,4 +1278,8 @@ struct bpf_sockopt_kern { 353 | s32 retval; 354 | }; 355 | 356 | +struct bpf_storage_kern { 357 | + char data[512]; 358 | +}; 359 | + 360 | #endif /* __LINUX_FILTER_H__ */ 361 | diff --git a/include/linux/fs.h b/include/linux/fs.h 362 | index f5abba86107d..a209bd67209e 100644 363 | --- a/include/linux/fs.h 364 | +++ b/include/linux/fs.h 365 | @@ -950,6 +950,8 @@ struct file { 366 | struct inode *f_inode; /* cached value */ 367 | const struct file_operations *f_op; 368 | 369 | + int _bpf_level; 370 | + 371 | /* 372 | * Protects f_ep_links, f_flags. 
373 | * Must not be taken from IRQ context. 374 | diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h 375 | index b951a87da987..84e9b80c4bf3 100644 376 | --- a/include/linux/syscalls.h 377 | +++ b/include/linux/syscalls.h 378 | @@ -1424,4 +1424,6 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, 379 | unsigned int nsops, 380 | const struct old_timespec32 __user *timeout); 381 | 382 | +asmlinkage long sys_set_bpf_level(int fd, int level); 383 | + 384 | #endif 385 | diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h 386 | index 8bd33050b7bb..81f5bf3768fd 100644 387 | --- a/include/uapi/linux/bpf.h 388 | +++ b/include/uapi/linux/bpf.h 389 | @@ -189,6 +189,7 @@ enum bpf_prog_type { 390 | BPF_PROG_TYPE_STRUCT_OPS, 391 | BPF_PROG_TYPE_EXT, 392 | BPF_PROG_TYPE_LSM, 393 | + BPF_PROG_TYPE_STORAGE, 394 | }; 395 | 396 | enum bpf_attach_type { 397 | @@ -226,7 +227,8 @@ enum bpf_attach_type { 398 | BPF_CGROUP_INET4_GETSOCKNAME, 399 | BPF_CGROUP_INET6_GETSOCKNAME, 400 | BPF_XDP_DEVMAP, 401 | - __MAX_BPF_ATTACH_TYPE 402 | + BPF_STORAGE, 403 | + __MAX_BPF_ATTACH_TYPE, 404 | }; 405 | 406 | #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE 407 | @@ -4261,4 +4263,9 @@ struct bpf_pidns_info { 408 | __u32 pid; 409 | __u32 tgid; 410 | }; 411 | + 412 | +struct bpf_storage { 413 | + char data[512]; 414 | +}; 415 | + 416 | #endif /* _UAPI__LINUX_BPF_H__ */ 417 | diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c 418 | index 0fd80ac81f70..8550179bbc43 100644 419 | --- a/kernel/bpf/syscall.c 420 | +++ b/kernel/bpf/syscall.c 421 | @@ -2815,6 +2815,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) 422 | return BPF_PROG_TYPE_CGROUP_SOCKOPT; 423 | case BPF_TRACE_ITER: 424 | return BPF_PROG_TYPE_TRACING; 425 | + case BPF_STORAGE: 426 | + return BPF_PROG_TYPE_STORAGE; 427 | default: 428 | return BPF_PROG_TYPE_UNSPEC; 429 | } 430 | @@ -2825,6 +2827,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) 431 | #define BPF_F_ATTACH_MASK \ 432 | (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) 433 | 434 | +int _storage_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); 435 | +int _storage_bpf_prog_detach(const union bpf_attr *attr); 436 | + 437 | static int bpf_prog_attach(const union bpf_attr *attr) 438 | { 439 | enum bpf_prog_type ptype; 440 | @@ -2870,6 +2875,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) 441 | case BPF_PROG_TYPE_SOCK_OPS: 442 | ret = cgroup_bpf_prog_attach(attr, ptype, prog); 443 | break; 444 | + case BPF_PROG_TYPE_STORAGE: 445 | + ret = _storage_bpf_prog_attach(attr, prog); 446 | + break; 447 | default: 448 | ret = -EINVAL; 449 | } 450 | @@ -2906,6 +2914,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) 451 | case BPF_PROG_TYPE_CGROUP_SYSCTL: 452 | case BPF_PROG_TYPE_SOCK_OPS: 453 | return cgroup_bpf_prog_detach(attr, ptype); 454 | + case BPF_PROG_TYPE_STORAGE: 455 | + return _storage_bpf_prog_detach(attr); 456 | default: 457 | return -EINVAL; 458 | } 459 | -------------------------------------------------------------------------------- /kernel/syscall_hook.diff: -------------------------------------------------------------------------------- 1 | diff --git a/Makefile b/Makefile 2 | index 24a4c1b97bb0..5449ee659860 100644 3 | --- a/Makefile 4 | +++ b/Makefile 5 | @@ -2,7 +2,7 @@ 6 | VERSION = 5 7 | PATCHLEVEL = 8 8 | SUBLEVEL = 0 9 | -EXTRAVERSION = 10 | +EXTRAVERSION =-bpf-storage-syscall 11 | NAME = Kleptomaniac Octopus 12 | 13 | # *DOCUMENTATION* 14 | diff --git 
a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl 15 | index 78847b32e137..f9a4b4b7bdda 100644 16 | --- a/arch/x86/entry/syscalls/syscall_64.tbl 17 | +++ b/arch/x86/entry/syscalls/syscall_64.tbl 18 | @@ -360,6 +360,7 @@ 19 | 437 common openat2 sys_openat2 20 | 438 common pidfd_getfd sys_pidfd_getfd 21 | 439 common faccessat2 sys_faccessat2 22 | +440 common set_bpf_level sys_set_bpf_level 23 | 24 | # 25 | # x32-specific system call numbers start at 512 to avoid cache impact 26 | diff --git a/fs/ioctl.c b/fs/ioctl.c 27 | index d69786d1dd91..9ccd5201331d 100644 28 | --- a/fs/ioctl.c 29 | +++ b/fs/ioctl.c 30 | @@ -19,6 +19,8 @@ 31 | #include 32 | #include 33 | #include 34 | +#include 35 | +#include 36 | 37 | #include "internal.h" 38 | 39 | @@ -762,6 +764,57 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 40 | return ksys_ioctl(fd, cmd, arg); 41 | } 42 | 43 | +SYSCALL_DEFINE2(set_bpf_level, int, fd, int, level) 44 | +{ 45 | + struct fd f = fdget_pos(fd); 46 | + long ret = -EBADF; 47 | + 48 | + if (f.file) { 49 | + f.file->_bpf_level = level; 50 | + fdput_pos(f); 51 | + ret = 0; 52 | + } else { 53 | + printk("set_bpf_level: bad file descriptor\n"); 54 | + } 55 | + 56 | + return ret; 57 | +} 58 | + 59 | +struct bpf_prog __rcu *_bpf_prog; 60 | +EXPORT_SYMBOL(_bpf_prog); 61 | + 62 | +struct bpf_storage_kern _bpf_g_context; 63 | +EXPORT_SYMBOL(_bpf_g_context); 64 | + 65 | +int _storage_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) 66 | +{ 67 | + rcu_assign_pointer(_bpf_prog, prog); 68 | + return 0; 69 | +} 70 | + 71 | +int _storage_bpf_prog_detach(const union bpf_attr *attr) 72 | +{ 73 | + rcu_assign_pointer(_bpf_prog, NULL); 74 | + return 0; 75 | +} 76 | + 77 | +const struct bpf_prog_ops storage_prog_ops = {}; 78 | + 79 | +static const struct bpf_func_proto * 80 | +storage_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 81 | +{ 82 | + return bpf_base_func_proto(func_id); 83 | +} 84 | + 85 | +static bool storage_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info){ 86 | + return true; 87 | +} 88 | + 89 | +const struct bpf_verifier_ops storage_verifier_ops = { 90 | + .get_func_proto = storage_func_proto, 91 | + .is_valid_access = storage_is_valid_access, 92 | +}; 93 | + 94 | #ifdef CONFIG_COMPAT 95 | /** 96 | * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation 97 | diff --git a/fs/read_write.c b/fs/read_write.c 98 | index 4fb797822567..22e9880af502 100644 99 | --- a/fs/read_write.c 100 | +++ b/fs/read_write.c 101 | @@ -21,6 +21,8 @@ 102 | #include 103 | #include 104 | #include "internal.h" 105 | +#include 106 | +#include 107 | 108 | #include 109 | #include 110 | @@ -612,9 +614,63 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) 111 | return ret; 112 | } 113 | 114 | +extern struct bpf_prog __rcu *_bpf_prog; 115 | +extern struct bpf_storage_kern _bpf_g_context; 116 | + 117 | SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 118 | { 119 | - return ksys_read(fd, buf, count); 120 | + struct fd f; 121 | + int _bpf_level; 122 | + loff_t pos, *ppos; 123 | + 124 | + /* check bpf level info */ 125 | + f = fdget_pos(fd); 126 | + if (!f.file) { 127 | + return -EBADF; 128 | + } 129 | + _bpf_level = f.file->_bpf_level; 130 | + ppos = file_ppos(f.file); 131 | + if (ppos) { 132 | + pos = *ppos; 133 | + } 134 | + fdput_pos(f); 135 | + 136 | + if (_bpf_level == 0) { 137 | + 
/* normal read */ 138 | + return ksys_read(fd, buf, count); 139 | + } else { 140 | + /* bpf read with resubmission */ 141 | + if (!ppos) { 142 | + printk("bpf read: invalid offset\n"); 143 | + return -EBADF; 144 | + } 145 | + long index = pos >> 12; 146 | + int i; 147 | + struct bpf_prog *_local_bpf_prog; 148 | + u32 _bpf_return; 149 | + for (i = 0; i < _bpf_level; ++i) { 150 | + if (i > 0) { 151 | + rcu_read_lock(); 152 | + _local_bpf_prog = rcu_dereference(_bpf_prog); 153 | + if (_local_bpf_prog) { 154 | + _bpf_return = BPF_PROG_RUN(_local_bpf_prog, &_bpf_g_context); 155 | + } 156 | + rcu_read_unlock(); 157 | + } 158 | + off_t lseek_ret = ksys_lseek(fd, index << 12, SEEK_SET); 159 | + if (lseek_ret != index << 12) { 160 | + printk("bpf read: ksys_lseek failed\n"); 161 | + return -EBADF; 162 | + } 163 | + ssize_t read_ret = ksys_read(fd, buf, count); 164 | + if (read_ret != count) { 165 | + printk("bpf read: ksys_read failed\n"); 166 | + return -EBADF; 167 | + } 168 | + index = (index * 1103515245 + 12345) % (1 << 23); /* randomly choose next offset */ 169 | + } 170 | + return count; 171 | + } 172 | } 173 | 174 | ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) 175 | diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h 176 | index a18ae82a298a..53e695703b57 100644 177 | --- a/include/linux/bpf_types.h 178 | +++ b/include/linux/bpf_types.h 179 | @@ -76,6 +76,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, 180 | #endif /* CONFIG_BPF_LSM */ 181 | #endif 182 | 183 | +BPF_PROG_TYPE(BPF_PROG_TYPE_STORAGE, storage, 184 | + struct bpf_storage, struct bpf_storage_kern) 185 | + 186 | BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) 187 | BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) 188 | BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) 189 | diff --git a/include/linux/filter.h b/include/linux/filter.h 190 | index 0b0144752d78..cb81a2458192 100644 191 | --- a/include/linux/filter.h 192 | +++ b/include/linux/filter.h 193 | @@ -1278,4 +1278,8 @@ struct bpf_sockopt_kern { 194 | s32 retval; 195 | }; 196 | 197 | +struct bpf_storage_kern { 198 | + char data[512]; 199 | +}; 200 | + 201 | #endif /* __LINUX_FILTER_H__ */ 202 | diff --git a/include/linux/fs.h b/include/linux/fs.h 203 | index f5abba86107d..a209bd67209e 100644 204 | --- a/include/linux/fs.h 205 | +++ b/include/linux/fs.h 206 | @@ -950,6 +950,8 @@ struct file { 207 | struct inode *f_inode; /* cached value */ 208 | const struct file_operations *f_op; 209 | 210 | + int _bpf_level; 211 | + 212 | /* 213 | * Protects f_ep_links, f_flags. 214 | * Must not be taken from IRQ context. 
215 | diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h 216 | index b951a87da987..84e9b80c4bf3 100644 217 | --- a/include/linux/syscalls.h 218 | +++ b/include/linux/syscalls.h 219 | @@ -1424,4 +1424,6 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, 220 | unsigned int nsops, 221 | const struct old_timespec32 __user *timeout); 222 | 223 | +asmlinkage long sys_set_bpf_level(int fd, int level); 224 | + 225 | #endif 226 | diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h 227 | index 8bd33050b7bb..0fce9b3d875c 100644 228 | --- a/include/uapi/linux/bpf.h 229 | +++ b/include/uapi/linux/bpf.h 230 | @@ -189,6 +189,7 @@ enum bpf_prog_type { 231 | BPF_PROG_TYPE_STRUCT_OPS, 232 | BPF_PROG_TYPE_EXT, 233 | BPF_PROG_TYPE_LSM, 234 | + BPF_PROG_TYPE_STORAGE, 235 | }; 236 | 237 | enum bpf_attach_type { 238 | @@ -226,6 +227,7 @@ enum bpf_attach_type { 239 | BPF_CGROUP_INET4_GETSOCKNAME, 240 | BPF_CGROUP_INET6_GETSOCKNAME, 241 | BPF_XDP_DEVMAP, 242 | + BPF_STORAGE, 243 | __MAX_BPF_ATTACH_TYPE 244 | }; 245 | 246 | @@ -4261,4 +4263,9 @@ struct bpf_pidns_info { 247 | __u32 pid; 248 | __u32 tgid; 249 | }; 250 | + 251 | +struct bpf_storage { 252 | + char data[512]; 253 | +}; 254 | + 255 | #endif /* _UAPI__LINUX_BPF_H__ */ 256 | diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c 257 | index 0fd80ac81f70..8550179bbc43 100644 258 | --- a/kernel/bpf/syscall.c 259 | +++ b/kernel/bpf/syscall.c 260 | @@ -2815,6 +2815,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) 261 | return BPF_PROG_TYPE_CGROUP_SOCKOPT; 262 | case BPF_TRACE_ITER: 263 | return BPF_PROG_TYPE_TRACING; 264 | + case BPF_STORAGE: 265 | + return BPF_PROG_TYPE_STORAGE; 266 | default: 267 | return BPF_PROG_TYPE_UNSPEC; 268 | } 269 | @@ -2825,6 +2827,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) 270 | #define BPF_F_ATTACH_MASK \ 271 | (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) 272 | 273 | +int _storage_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); 274 | +int _storage_bpf_prog_detach(const union bpf_attr *attr); 275 | + 276 | static int bpf_prog_attach(const union bpf_attr *attr) 277 | { 278 | enum bpf_prog_type ptype; 279 | @@ -2870,6 +2875,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) 280 | case BPF_PROG_TYPE_SOCK_OPS: 281 | ret = cgroup_bpf_prog_attach(attr, ptype, prog); 282 | break; 283 | + case BPF_PROG_TYPE_STORAGE: 284 | + ret = _storage_bpf_prog_attach(attr, prog); 285 | + break; 286 | default: 287 | ret = -EINVAL; 288 | } 289 | @@ -2906,6 +2914,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) 290 | case BPF_PROG_TYPE_CGROUP_SYSCTL: 291 | case BPF_PROG_TYPE_SOCK_OPS: 292 | return cgroup_bpf_prog_detach(attr, ptype); 293 | + case BPF_PROG_TYPE_STORAGE: 294 | + return _storage_bpf_prog_detach(attr); 295 | default: 296 | return -EINVAL; 297 | } 298 | --------------------------------------------------------------------------------
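
As a usage note, the sketch below shows how the set_bpf_level syscall that both patches register (entry 440 in syscall_64.tbl) is meant to be driven from user space. It is illustrative only, not a file from this tree: the __NR_set_bpf_level constant is our own shorthand for that table entry, and the 4096-byte aligned read simply matches the 4 KB (pos >> 12) granularity the patched sys_read assumes when it chains reads.

#define _GNU_SOURCE /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_set_bpf_level 440 /* our shorthand; number taken from the syscall_64.tbl hunks above */

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <level>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Tag the fd; the patched sys_read will then issue <level> chained reads. */
    if (syscall(__NR_set_bpf_level, fd, atoi(argv[2]))) {
        perror("set_bpf_level");
        return 1;
    }

    void *buf;
    if (posix_memalign(&buf, 4096, 4096)) { /* O_DIRECT wants aligned buffers */
        perror("posix_memalign");
        return 1;
    }

    /* One read() from user space; the kernel resubmits the rest of the chain
     * and runs the attached BPF_PROG_TYPE_STORAGE program between hops. */
    ssize_t ret = read(fd, buf, 4096);
    printf("read returned %zd\n", ret);
    return 0;
}

The uring_bpf benchmark above performs the same tagging through its sys_set_bpf_level() wrapper before registering its files with io_uring.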