├── .gitignore ├── README.md ├── go.mod ├── go.sum ├── main.go ├── main.py └── main.zig /.gitignore: -------------------------------------------------------------------------------- 1 | main 2 | *.bin 3 | *~ 4 | *.csv 5 | gobufioplayground 6 | *.o -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IO Playground 2 | 3 | The point of this repo is to get an intuition about different IO 4 | models. Comparing across languages isn't for benchmark wars; it's more 5 | a check for correctness. The relative difference between IO models 6 | should be similar across languages. 7 | 8 | ## Machine 9 | 10 | I am running these tests on a dedicated bare metal instance, [OVH 11 | Rise-1](https://eco.us.ovhcloud.com/#filterType=range_element&filterValue=rise). 12 | 13 | * RAM: 64 GB DDR4 ECC 2,133 MHz 14 | * Disk: 2x450 GB SSD NVMe in Soft RAID 15 | * Processor: Intel Xeon E3-1230v6 - 4c/8t - 3.5 GHz/3.9 GHz 16 | * `uname --kernel-release`: 6.3.8-100.fc37.x86_64 17 | 18 | ## Write 1GiB to one file (4KiB buffer) 19 | 20 | Each implementation (other than `dd`) produces a CSV of results. Use 21 | the following DuckDB command to analyze it. 22 | 23 | ``` 24 | $ duckdb -c " 25 | SELECT 26 | column0 AS method, 27 | AVG(column1::DOUBLE) || 's' avg_time, 28 | FORMAT_BYTES(AVG(column2::DOUBLE)::BIGINT) || '/s' AS avg_throughput 29 | FROM 'out.csv' 30 | GROUP BY column0 31 | ORDER BY AVG(column1::DOUBLE) ASC" 32 | ``` 33 | 34 | ### `dd` (Control) 35 | 36 | ``` 37 | $ dd if=/dev/zero of=test.bin bs=4k count=1M 38 | 1048576+0 records in 39 | 1048576+0 records out 40 | 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 3.09765 s, 1.4 GB/s 41 | ``` 42 | 43 | ### Go 44 | 45 | This version submits up to N entries but then blocks until all N entries 46 | complete. This is not ideal, but I'm not sure Iceber/iouring-go 47 | supports anything else. 48 | 49 | First: 50 | 51 | ``` 52 | $ go run main.go | tee out.csv 53 | ``` 54 | 55 | Then run the DuckDB command from above: 56 | 57 | ``` 58 | ┌────────────────────────────────────────────┬─────────────────────┬────────────────┐ 59 | │ method                                     │ avg_time            │ avg_throughput │ 60 | │ varchar                                    │ varchar             │ varchar        │ 61 | ├────────────────────────────────────────────┼─────────────────────┼────────────────┤ 62 | │ 1_goroutines_pwrite                        │ 0.7111268999999999s │ 1.5GB/s        │ 63 | │ blocking                                   │ 0.7128968s          │ 1.5GB/s        │ 64 | │ 1_goroutines_io_uring_pwrite_128_entries   │ 1.0402713s          │ 1.0GB/s        │ 65 | │ 10_goroutines_pwrite                       │ 1.111215s           │ 966.2MB/s      │ 66 | │ 100_goroutines_io_uring_pwrite_128_entries │ 1.3004915000000001s │ 825.6MB/s      │ 67 | │ 100_goroutines_io_uring_pwrite_1_entries   │ 1.5118257s          │ 710.2MB/s      │ 68 | │ 10_goroutines_io_uring_pwrite_128_entries  │ 1.5322980999999998s │ 771.6MB/s      │ 69 | │ 10_goroutines_io_uring_pwrite_1_entries    │ 1.6577722000000001s │ 648.1MB/s      │ 70 | │ 1_goroutines_io_uring_pwrite_1_entries     │ 4.705483s           │ 228.2MB/s      │ 71 | └────────────────────────────────────────────┴─────────────────────┴────────────────┘ 72 | ``` 73 | 74 | ### Zig 75 | 76 | Unlike the Go implementation, this version does not always batch exactly 77 | N entries at a time. It starts out batching N entries at a time but 78 | *does not block* waiting for all N to complete. Instead it reaps 79 | whatever has completed and keeps submitting more batches.
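In rough form, each worker's loop looks like this (a condensed sketch of `pwriteIOUringWorker` in `main.zig`, with error handling and per-thread bookkeeping omitted; `end` stands in for the worker's `offset + workSize`):

```
while (i < end or written < workSize) {
    // Queue writes until the submission queue is full or nothing is left.
    while (i < end) {
        const size = @min(bufferSize, end - i);
        _ = ring.write(0, file.handle, data[i .. i + size], i) catch break;
        i += size;
    }

    // Submit without waiting, then reap whatever has completed so far.
    _ = ring.submit() catch unreachable;
    const received = ring.copy_cqes(cqes, 0) catch unreachable;
    for (cqes[0..received]) |*cqe| {
        written += @as(usize, @intCast(cqe.res));
    }
}
```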
80 | 81 | First: 82 | 83 | ``` 84 | $ zig build-exe main.zig 85 | $ ./main | tee out.csv 86 | ``` 87 | 88 | Then run the DuckDB command from above: 89 | 90 | ``` 91 | ┌────────────────────────────────────────┬─────────────────────┬────────────────┐ 92 | │ method │ avg_time │ avg_throughput │ 93 | │ varchar │ varchar │ varchar │ 94 | ├────────────────────────────────────────┼─────────────────────┼────────────────┤ 95 | │ 1_threads_iouring_pwrite_128_entries │ 0.6080365773999998s │ 1.7GB/s │ 96 | │ 1_threads_iouring_pwrite_1_entries │ 0.6259650676999999s │ 1.7GB/s │ 97 | │ blocking │ 0.6740227804s │ 1.5GB/s │ 98 | │ 1_threads_pwrite │ 0.6846085126999999s │ 1.5GB/s │ 99 | │ 10_threads_pwrite │ 1.1549885629000003s │ 929.8MB/s │ 100 | │ 10_threads_iouring_pwrite_1_entries │ 2.4174379148s │ 445.7MB/s │ 101 | │ 10_threads_iouring_pwrite_128_entries │ 2.4178504731s │ 445.8MB/s │ 102 | │ 100_threads_iouring_pwrite_128_entries │ 3.6317807736s │ 296.6MB/s │ 103 | │ 100_threads_iouring_pwrite_1_entries │ 3.7681755905000003s │ 287.7MB/s │ 104 | └────────────────────────────────────────┴─────────────────────┴────────────────┘ 105 | ``` 106 | 107 | ### Python 108 | 109 | First: 110 | 111 | ``` 112 | $ python3 main.py | tee out.csv 113 | ``` 114 | 115 | Then run the DuckDB command from above: 116 | 117 | ``` 118 | ┌──────────┬────────────┬────────────────┐ 119 | │ method │ avg_time │ avg_throughput │ 120 | │ varchar │ varchar │ varchar │ 121 | ├──────────┼────────────┼────────────────┤ 122 | │ blocking │ 0.9259369s │ 1.1GB/s │ 123 | └──────────┴────────────┴────────────────┘ 124 | ``` 125 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module gobufioplayground 2 | 3 | go 1.21.2 4 | 5 | require github.com/iceber/iouring-go v0.0.0-20230403020409-002cfd2e2a90 6 | 7 | require golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d // indirect 8 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/iceber/iouring-go v0.0.0-20230403020409-002cfd2e2a90 h1:xrtfZokN++5kencK33hn2Kx3Uj8tGnjMEhdt6FMvHD0= 2 | github.com/iceber/iouring-go v0.0.0-20230403020409-002cfd2e2a90/go.mod h1:LEzdaZarZ5aqROlLIwJ4P7h3+4o71008fSy6wpaEB+s= 3 | golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d h1:L/IKR6COd7ubZrs2oTnTi73IhgqJ71c9s80WsQnh0Es= 4 | golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 5 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os" 7 | "sync" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/iceber/iouring-go" 12 | ) 13 | 14 | func assert(b bool) { 15 | if !b { 16 | panic("assert") 17 | } 18 | } 19 | 20 | const bufferSize = 4096 21 | 22 | func readNBytes(fn string, n int) []byte { 23 | f, err := os.Open(fn) 24 | if err != nil { 25 | panic(err) 26 | } 27 | defer f.Close() 28 | 29 | data := make([]byte, 0, n) 30 | 31 | var buffer = make([]byte, bufferSize) 32 | for len(data) < n { 33 | read, err := f.Read(buffer) 34 | if err != nil { 35 | panic(err) 36 | } 37 | 38 | data = append(data, buffer[:read]...) 
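// Read can return fewer than bufferSize bytes (short reads are normal, especially from /dev/random), so keep looping until data holds n bytes.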
39 | } 40 | 41 | assert(len(data) == n) 42 | 43 | return data 44 | } 45 | 46 | func benchmark(name string, directIO bool, x []byte, fn func(*os.File)) { 47 | fmt.Printf("%s", name) 48 | flags := os.O_RDWR | os.O_CREATE | os.O_TRUNC 49 | if directIO { 50 | flags |= syscall.O_DIRECT 51 | } 52 | f, err := os.OpenFile("out.bin", flags, 0755) 53 | if err != nil { 54 | panic(err) 55 | } 56 | 57 | t1 := time.Now() 58 | 59 | fn(f) 60 | 61 | s := time.Now().Sub(t1).Seconds() 62 | fmt.Printf(",%f,%f\n", s, float64(len(x))/s) 63 | 64 | if err := f.Close(); err != nil { 65 | panic(err) 66 | } 67 | 68 | assert(bytes.Equal(readNBytes("out.bin", len(x)), x)) 69 | } 70 | 71 | func withPwriteAndWorkerRoutines(directIO bool, x []byte, workers int) { 72 | name := fmt.Sprintf("%d_goroutines_pwrite", workers) 73 | benchmark(name, directIO, x, func(f *os.File) { 74 | var wg sync.WaitGroup 75 | 76 | workSize := len(x) / workers 77 | 78 | for i := 0; i < len(x); i += workSize { 79 | wg.Add(1) 80 | go func(i int) { 81 | defer wg.Done() 82 | 83 | for j := i; j < i+workSize; j += bufferSize { 84 | if j >= i+workSize || j >= len(x) { 85 | break 86 | } 87 | size := min(min(bufferSize, (i+workSize)-j), len(x)-j) 88 | n, err := f.WriteAt(x[j:j+size], int64(j)) 89 | if err != nil { 90 | panic(err) 91 | } 92 | 93 | assert(n == size) 94 | } 95 | }(i) 96 | } 97 | wg.Wait() 98 | }) 99 | } 100 | 101 | func withIOUringAndWorkerRoutines(directIO bool, x []byte, entries int, workers int) { 102 | name := fmt.Sprintf("%d_goroutines_io_uring_pwrite_%d_entries", workers, entries) 103 | benchmark(name, directIO, x, func(f *os.File) { 104 | var wg sync.WaitGroup 105 | workSize := len(x) / workers 106 | 107 | for i := 0; i < len(x); i += workSize { 108 | wg.Add(1) 109 | go func(i int) { 110 | requests := make([]iouring.PrepRequest, entries) 111 | iour, err := iouring.New(uint(entries)) 112 | if err != nil { 113 | panic(err) 114 | } 115 | defer iour.Close() 116 | 117 | defer wg.Done() 118 | 119 | for j := i; j < i+workSize; j += bufferSize * entries { 120 | submittedEntries := 0 121 | for k := 0; k < entries; k++ { 122 | base := j + k*bufferSize 123 | if base >= i+workSize || base >= len(x) { 124 | break 125 | } 126 | submittedEntries++ 127 | size := min(min(bufferSize, (i+workSize)-base), len(x)-base) 128 | requests[k] = iouring.Pwrite(int(f.Fd()), x[base:base+size], uint64(base)) 129 | } 130 | 131 | // It's unclear to me how this 132 | // case happens, but it does. 133 | // If we don't skip it here, 134 | // it locks forever at 135 | // <-res.Done().
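// Likely cause: when len(x) is not an exact multiple of workSize, the final goroutine's range extends past len(x), so every batch in that tail ends up empty.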
136 | if submittedEntries == 0 { 137 | continue 138 | } 139 | 140 | res, err := iour.SubmitRequests(requests[:submittedEntries], nil) 141 | if err != nil { 142 | panic(err) 143 | } 144 | <-res.Done() 145 | 146 | for _, result := range res.ErrResults() { 147 | n, err := result.ReturnInt() 148 | if err != nil { 149 | panic(err) 150 | } 151 | 152 | assert(n == bufferSize) 153 | } 154 | } 155 | }(i) 156 | } 157 | wg.Wait() 158 | }) 159 | } 160 | 161 | func main() { 162 | size := 1073741824 // 1GiB 163 | x := readNBytes("/dev/random", size) 164 | 165 | var directIO = false 166 | for _, arg := range os.Args { 167 | if arg == "--directio" { 168 | directIO = true 169 | } 170 | } 171 | 172 | for i := 0; i < 10; i++ { 173 | // No buffering 174 | benchmark("blocking", directIO, x, func(f *os.File) { 175 | for i := 0; i < len(x); i += bufferSize { 176 | size := min(bufferSize, len(x)-i) 177 | n, err := f.Write(x[i : i+size]) 178 | if err != nil { 179 | panic(err) 180 | } 181 | 182 | assert(n == size) 183 | } 184 | }) 185 | 186 | withPwriteAndWorkerRoutines(directIO, x, 1) 187 | withPwriteAndWorkerRoutines(directIO, x, 10) 188 | 189 | withIOUringAndWorkerRoutines(directIO, x, 1, 10) 190 | withIOUringAndWorkerRoutines(directIO, x, 128, 10) 191 | withIOUringAndWorkerRoutines(directIO, x, 1, 100) 192 | withIOUringAndWorkerRoutines(directIO, x, 128, 100) 193 | withIOUringAndWorkerRoutines(directIO, x, 1, 1) 194 | withIOUringAndWorkerRoutines(directIO, x, 128, 1) 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | def read_n_bytes(fn, want): 4 | data = bytearray(want) 5 | with open(fn, "rb") as f: 6 | have = 0 7 | while have < want: 8 | chunk = f.read(4096) 9 | n = min(len(chunk), want - have) 10 | data[have:have + n] = chunk[:n] 11 | have += n 12 | 13 | assert len(data) == want 14 | return data 15 | 16 | def main(): 17 | x = read_n_bytes("/dev/random", 2**30) 18 | 19 | for _ in range(10): 20 | with open("out.bin", "wb") as f: 21 | t1 = datetime.datetime.now() 22 | 23 | i = 0 24 | while i < len(x): 25 | f.write(x[i:i+4096]) 26 | i += 4096 27 | 28 | t2 = datetime.datetime.now() 29 | diff = (t2-t1).total_seconds() 30 | print(f"blocking,{diff},{len(x) / diff}") 31 | 32 | main() 33 | -------------------------------------------------------------------------------- /main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | fn readNBytes(allocator: *const std.mem.Allocator, filename: []const u8, n: usize) ![]const u8 { 4 | const file = try std.fs.cwd().openFile(filename, .{}); 5 | defer file.close(); 6 | 7 | var data = try allocator.alloc(u8, n); 8 | var buf = try allocator.alloc(u8, 4096); 9 | 10 | var written: usize = 0; 11 | while (written < n) { 12 | const nread = try file.read(buf); 13 | @memcpy(data[written .. written + nread], buf[0..nread]); 14 | written += nread; 15 | } 16 | 17 | std.debug.assert(written == n); 18 | return data; 19 | } 20 | 21 | fn createFile(f: []const u8, directIO: bool) !std.fs.File { 22 | const file = try std.fs.cwd().createFile(f, .{ 23 | .truncate = true, 24 | }); 25 | 26 | if (directIO) { 27 | const flags: usize = try std.os.fcntl(file.handle, std.os.linux.F.GETFL, 0); 28 | _ = try std.os.fcntl(file.handle, std.os.linux.F.SETFL, flags | std.os.O.DIRECT); 29 | } 30 | return file; 31 | } 32 | 33 | const Benchmark = struct {
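// Times a single run: init() prints the method name and (re)creates out.bin; stop() prints elapsed seconds and bytes/second, then reads out.bin back and asserts it matches the source data.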
34 | t: std.time.Timer, 35 | file: std.fs.File, 36 | data: []const u8, 37 | allocator: *const std.mem.Allocator, 38 | 39 | fn init( 40 | allocator: *const std.mem.Allocator, 41 | name: []const u8, 42 | directIO: bool, 43 | data: []const u8, 44 | ) !Benchmark { 45 | try std.io.getStdOut().writer().print("{s}", .{name}); 46 | if (directIO) { 47 | try std.io.getStdOut().writer().print("_directio", .{}); 48 | } 49 | 50 | var file = try createFile(outFile, directIO); 51 | 52 | return Benchmark{ 53 | .t = try std.time.Timer.start(), 54 | .file = file, 55 | .data = data, 56 | .allocator = allocator, 57 | }; 58 | } 59 | 60 | fn stop(b: *Benchmark) void { 61 | const s = @as(f64, @floatFromInt(b.t.read())) / std.time.ns_per_s; 62 | std.io.getStdOut().writer().print( 63 | ",{d},{d}\n", 64 | .{ s, @as(f64, @floatFromInt(b.data.len)) / s }, 65 | ) catch unreachable; 66 | 67 | b.file.close(); 68 | 69 | var in = readNBytes(b.allocator, outFile, b.data.len) catch unreachable; 70 | std.debug.assert(std.mem.eql(u8, in, b.data)); 71 | b.allocator.free(in); 72 | } 73 | }; 74 | 75 | const ThreadInfo = struct { 76 | file: *const std.fs.File, 77 | data: []const u8, 78 | offset: usize, 79 | workSize: usize, 80 | allocator: *const std.mem.Allocator, 81 | }; 82 | 83 | const outFile = "out.bin"; 84 | const bufferSize: u64 = 4096; // 1048576; // 1mib 85 | 86 | fn pwriteWorker(info: *ThreadInfo) void { 87 | var i: usize = info.offset; 88 | var written: usize = 0; 89 | while (i < info.offset + info.workSize) : (i += bufferSize) { 90 | const size = @min(bufferSize, (info.offset + info.workSize) - i); 91 | const n = info.file.pwrite(info.data[i .. i + size], i) catch unreachable; 92 | written += n; 93 | std.debug.assert(n <= bufferSize); 94 | std.debug.assert(n == size); 95 | } 96 | std.debug.assert(written == info.workSize); 97 | } 98 | 99 | fn threadsAndPwrite( 100 | comptime nWorkers: u8, 101 | allocator: *const std.mem.Allocator, 102 | x: []const u8, 103 | directIO: bool, 104 | ) !void { 105 | const name = try std.fmt.allocPrint(allocator.*, "{}_threads_pwrite", .{nWorkers}); 106 | defer allocator.free(name); 107 | var b = try Benchmark.init(allocator, name, directIO, x); 108 | defer b.stop(); 109 | 110 | var workers: [nWorkers]std.Thread = undefined; 111 | var workerInfo: [nWorkers]ThreadInfo = undefined; 112 | const workSize = x.len / nWorkers; 113 | for (&workers, 0..) |*worker, i| { 114 | workerInfo[i] = ThreadInfo{ 115 | .file = &b.file, 116 | .data = x, 117 | .offset = i * workSize, 118 | .workSize = workSize, 119 | .allocator = allocator, 120 | }; 121 | worker.* = try std.Thread.spawn(.{}, pwriteWorker, .{&workerInfo[i]}); 122 | } 123 | 124 | for (&workers) |*worker| { 125 | worker.join(); 126 | } 127 | } 128 | 129 | fn pwriteIOUringWorker(info: *ThreadInfo, nEntries: u13) void { 130 | var ring = std.os.linux.IO_Uring.init(nEntries, 0) catch |err| { 131 | std.debug.panic("Failed to initialize io_uring: {}\n", .{err}); 132 | return; 133 | }; 134 | defer ring.deinit(); 135 | 136 | var i: usize = info.offset; 137 | var written: usize = 0; 138 | 139 | var cqes = info.allocator.alloc(std.os.linux.io_uring_cqe, nEntries) catch unreachable; 140 | defer info.allocator.free(cqes); 141 | 142 | var totalSubs: usize = 0; 143 | while (i < info.offset + info.workSize or written < info.workSize) { 144 | // Fill in as many submissions as we can. 
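// ring.write() only queues an SQE; when the submission queue (nEntries slots) is full it returns error.SubmissionQueueFull and we fall through to submit() below without waiting for completions.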
145 | while (true) { 146 | if (i >= info.offset + info.workSize) { 147 | break; 148 | } 149 | const size = @min(bufferSize, (info.offset + info.workSize) - i); 150 | _ = ring.write(0, info.file.handle, info.data[i .. i + size], i) catch |e| switch (e) { 151 | error.SubmissionQueueFull => break, 152 | else => unreachable, 153 | }; 154 | i += size; 155 | totalSubs += 1; 156 | } 157 | 158 | // Submit and do not block. We'll read whatever's available 159 | // each time through. 160 | _ = ring.submit() catch unreachable; 161 | 162 | const received = ring.copy_cqes(cqes, 0) catch unreachable; 163 | 164 | for (cqes[0..received]) |*cqe| { 165 | if (cqe.err() != .SUCCESS) { 166 | @panic("Request failed"); 167 | } 168 | 169 | std.debug.assert(cqe.res >= 0); 170 | const n = @as(usize, @intCast(cqe.res)); 171 | written += n; 172 | std.debug.assert(n <= bufferSize); 173 | } 174 | } 175 | std.debug.assert(written == info.workSize); 176 | } 177 | 178 | fn threadsAndIOUringPwrite( 179 | comptime nWorkers: u8, 180 | allocator: *const std.mem.Allocator, 181 | x: []const u8, 182 | entries: u13, 183 | directIO: bool, 184 | ) !void { 185 | const name = try std.fmt.allocPrint(allocator.*, "{}_threads_iouring_pwrite_{}_entries", .{ nWorkers, entries }); 186 | defer allocator.free(name); 187 | var b = try Benchmark.init(allocator, name, directIO, x); 188 | defer b.stop(); 189 | 190 | var workers: [nWorkers]std.Thread = undefined; 191 | var workerInfo: [nWorkers]ThreadInfo = undefined; 192 | const workSize = x.len / nWorkers; 193 | for (&workers, 0..) |*worker, i| { 194 | workerInfo[i] = ThreadInfo{ 195 | .file = &b.file, 196 | .data = x, 197 | .offset = i * workSize, 198 | .workSize = workSize, 199 | .allocator = allocator, 200 | }; 201 | worker.* = try std.Thread.spawn(.{}, pwriteIOUringWorker, .{ &workerInfo[i], entries }); 202 | } 203 | 204 | for (&workers) |*worker| { 205 | worker.join(); 206 | } 207 | } 208 | 209 | pub fn main() !void { 210 | var allocator = &std.heap.page_allocator; 211 | 212 | const SIZE = 1073741824; // 1GiB 213 | var x = try readNBytes(allocator, "/dev/random", SIZE); 214 | defer allocator.free(x); 215 | 216 | var args = std.process.args(); 217 | var directIO = false; 218 | while (args.next()) |arg| { 219 | if (std.mem.eql(u8, arg, "--directio")) { 220 | directIO = true; 221 | } 222 | } 223 | 224 | var run: usize = 0; 225 | while (run < 10) : (run += 1) { 226 | { 227 | var b = try Benchmark.init(allocator, "blocking", directIO, x); 228 | defer b.stop(); 229 | 230 | var i: usize = 0; 231 | while (i < x.len) : (i += bufferSize) { 232 | const size = @min(bufferSize, x.len - i); 233 | const n = try b.file.write(x[i .. i + size]); 234 | std.debug.assert(n == size); 235 | } 236 | } 237 | 238 | try threadsAndPwrite(1, allocator, x, directIO); 239 | try threadsAndPwrite(10, allocator, x, directIO); 240 | 241 | try threadsAndIOUringPwrite(1, allocator, x, 1, directIO); 242 | try threadsAndIOUringPwrite(1, allocator, x, 128, directIO); 243 | 244 | try threadsAndIOUringPwrite(10, allocator, x, 1, directIO); 245 | try threadsAndIOUringPwrite(10, allocator, x, 128, directIO); 246 | 247 | try threadsAndIOUringPwrite(100, allocator, x, 1, directIO); 248 | try threadsAndIOUringPwrite(100, allocator, x, 128, directIO); 249 | } 250 | } 251 | --------------------------------------------------------------------------------