├── .gitignore ├── go.mod ├── examples ├── bulk-socket │ ├── go.mod │ ├── main.go │ └── go.sum ├── cp │ ├── benchmark.sh │ ├── stdio.go │ └── iouring.go ├── write │ └── main.go ├── net │ └── main.go └── standalone │ └── main.go ├── uring_enter_test.go ├── uring_setup_test.go ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ └── go.yml ├── uring_enter.go ├── tests └── bug-9.go ├── ring_option_test.go ├── conn_test.go ├── ring_test.go ├── LICENSE ├── register_test.go ├── utils.go ├── completer.go ├── go.sum ├── read_writer_test.go ├── submitter.go ├── ring_option.go ├── consts.go ├── ring_conn.go ├── uring_setup.go ├── ring_benchmark_test.go ├── register.go ├── read_writer.go ├── types_test.go ├── types.go ├── conn.go ├── README.md ├── ops_test.go ├── ring.go └── ops.go /.gitignore: -------------------------------------------------------------------------------- 1 | vendor 2 | hello.txt 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hodgesds/iouring-go 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/pkg/errors v0.9.1 7 | github.com/stretchr/testify v1.4.0 8 | golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 9 | ) 10 | -------------------------------------------------------------------------------- /examples/bulk-socket/go.mod: -------------------------------------------------------------------------------- 1 | module bulk 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/gin-contrib/sse v0.1.0 7 | github.com/gin-gonic/gin v1.6.3 8 | github.com/hodgesds/iouring-go v0.0.0-20200506041732-4ec64dcb5875 9 | github.com/r3labs/sse v0.0.0-20200310095403-ee05428e4d0e 10 | ) 11 | -------------------------------------------------------------------------------- /uring_enter_test.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | 
import ( 6 | "syscall" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func TestEnter(t *testing.T) { 13 | p := Params{} 14 | fd, err := Setup(1024, &p) 15 | if err != nil { 16 | t.Fatal(err) 17 | } 18 | defer require.NoError(t, syscall.Close(fd)) 19 | } 20 | -------------------------------------------------------------------------------- /uring_setup_test.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "testing" 7 | 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestMmapSubmitRing(t *testing.T) { 12 | var p Params 13 | fd, err := Setup(1024, &p) 14 | require.NoError(t, err) 15 | var ( 16 | cq CompletionQueue 17 | sq SubmitQueue 18 | ) 19 | require.NoError(t, MmapRing(fd, &p, &sq, &cq)) 20 | } 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: For all outstanding issues 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Linux distribution, Kernel Version (`uname -a`):** 20 | 21 | **Additional context** 22 | Add any other context about the problem here. 
23 | -------------------------------------------------------------------------------- /examples/cp/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for fsize in 128 256 512 1024; do 4 | dd if=/dev/urandom of=test bs=1M count="$fsize" 5 | echo "benchmarking standard go file io" 6 | for i in {1..10}; do 7 | echo 3 > /proc/sys/vm/drop_caches 8 | time go run stdio.go test test.copy 9 | cmp --silent test test.copy || exit 1 10 | rm -f test.copy 11 | done 12 | sync 13 | 14 | echo "benchmarking io_uring go file io" 15 | for i in {1..10}; do 16 | echo 3 > /proc/sys/vm/drop_caches 17 | time go run iouring.go test test.copy 18 | cmp test test.copy || exit 1 19 | rm -f $1.copy 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /uring_enter.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "syscall" 7 | "unsafe" 8 | 9 | "golang.org/x/sys/unix" 10 | ) 11 | 12 | // Enter is used to submit to the queue. 
13 | func Enter(fd int, toSubmit uint, minComplete uint, flags uint, sigset *unix.Sigset_t) (int, error) { 14 | res, _, errno := syscall.Syscall6( 15 | EnterSyscall, 16 | uintptr(fd), 17 | uintptr(toSubmit), 18 | uintptr(minComplete), 19 | uintptr(flags), 20 | uintptr(unsafe.Pointer(sigset)), 21 | uintptr(0), 22 | ) 23 | if errno != 0 { 24 | var err error 25 | err = errno 26 | return 0, err 27 | } 28 | if res < 0 { 29 | return 0, syscall.Errno(-res) 30 | } 31 | 32 | return int(res), nil 33 | } 34 | -------------------------------------------------------------------------------- /tests/bug-9.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | 7 | "github.com/hodgesds/iouring-go" 8 | ) 9 | 10 | func main() { 11 | r, err := iouring.New(1024, &iouring.Params{ 12 | Features: iouring.FeatNoDrop, 13 | }) 14 | if err != nil { 15 | log.Fatal(err) 16 | } 17 | 18 | // Open a file for registering with the ring. 19 | f, err := os.OpenFile("test", os.O_RDWR, 0755) 20 | if err != nil { 21 | log.Fatal(err) 22 | } 23 | 24 | // Register the file with the ring, which returns an io.WriteCloser. 25 | rw, err := r.FileReadWriter(f) 26 | if err != nil { 27 | log.Fatal(err) 28 | } 29 | 30 | buf := make([]byte, 4*1024*1024) 31 | //if _, err := rw.WriteAt(buf, 4328583168); err != nil { 32 | if _, err := rw.WriteAt(buf, 4096); err != nil { 33 | log.Fatal(err) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /examples/write/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | 7 | "github.com/hodgesds/iouring-go" 8 | ) 9 | 10 | func main() { 11 | r, err := iouring.New(1024, &iouring.Params{ 12 | Features: iouring.FeatNoDrop, 13 | }) 14 | if err != nil { 15 | log.Fatal(err) 16 | } 17 | 18 | // Open a file for registring with the ring. 
19 | f, err := os.OpenFile("hello.txt", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) 20 | if err != nil { 21 | log.Fatal(err) 22 | } 23 | 24 | // Register the file with the ring, which returns an io.WriteCloser. 25 | rw, err := r.FileReadWriter(f) 26 | if err != nil { 27 | log.Fatal(err) 28 | } 29 | 30 | if _, err := rw.Write([]byte("hello io_uring!")); err != nil { 31 | log.Fatal(err) 32 | } 33 | 34 | // Close the WriteCloser, which closes the open file (f). 35 | if err := r.Close(); err != nil { 36 | log.Fatal(err) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /ring_option_test.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "testing" 7 | 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestWithDebug(t *testing.T) { 12 | r, err := New(2048, nil, WithDebug()) 13 | require.NoError(t, err) 14 | require.NotNil(t, r) 15 | require.True(t, r.debug) 16 | } 17 | 18 | func TestWithFileRegistry(t *testing.T) { 19 | r, err := New(2048, nil, WithFileRegistry()) 20 | require.NoError(t, err) 21 | require.NotNil(t, r) 22 | require.NotNil(t, r.FileRegistry()) 23 | } 24 | 25 | func TestWithEventFd(t *testing.T) { 26 | r, err := New(2048, nil, WithEventFd(0, 0, false)) 27 | require.NoError(t, err) 28 | require.NotNil(t, r) 29 | require.True(t, r.EventFd() > 0) 30 | 31 | r, err = New(2048, nil, WithEventFd(0, 0, true)) 32 | require.NoError(t, err) 33 | require.NotNil(t, r) 34 | require.True(t, r.EventFd() > 0) 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build: 12 | name: Build 13 | runs-on: ubuntu-latest 14 | steps: 15 | 16 | - name: Set up 
Go 1.x 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: ^1.13 20 | id: go 21 | 22 | - name: Check out code into the Go module directory 23 | uses: actions/checkout@v2 24 | 25 | - name: Get dependencies 26 | run: | 27 | go get -v -t -d ./... 28 | if [ -f Gopkg.toml ]; then 29 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 30 | dep ensure 31 | fi 32 | 33 | - name: Build 34 | run: go build -v . 35 | 36 | - name: Test 37 | run: go test -v . 38 | 39 | - name: Bench 40 | run: go test -bench=. 41 | -------------------------------------------------------------------------------- /conn_test.go: -------------------------------------------------------------------------------- 1 | package iouring 2 | 3 | import ( 4 | "bytes" 5 | "io/ioutil" 6 | "net" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func TestSockoptListener(t *testing.T) { 13 | t.Skip("skip") 14 | r, err := New(8192, nil) 15 | require.NoError(t, err) 16 | require.NotNil(t, r) 17 | 18 | l, err := r.SockoptListener("tcp", ":9822", nil) 19 | require.NoError(t, err) 20 | require.NotNil(t, l) 21 | 22 | go func() { 23 | conn2, err := net.Dial("tcp", ":9822") 24 | require.NoError(t, err) 25 | require.NotNil(t, conn2) 26 | require.NoError(t, conn2.Close()) 27 | }() 28 | conn, err := l.Accept() 29 | require.NoError(t, err) 30 | require.NotNil(t, conn) 31 | require.NoError(t, conn.Close()) 32 | } 33 | 34 | func TestFastOpenAllowed(t *testing.T) { 35 | b, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_fack") 36 | require.NoError(t, err) 37 | if bytes.Contains(b, []byte("3")) { 38 | require.NoError(t, FastOpenAllowed()) 39 | } else { 40 | require.Error(t, FastOpenAllowed()) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /ring_test.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "testing" 7 | 8 | 
"github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestNew(t *testing.T) { 12 | r, err := New(2048, nil) 13 | require.NoError(t, err) 14 | require.NotNil(t, r) 15 | 16 | require.NotZero(t, r.sq.Size) 17 | require.NotNil(t, r.sq.Head) 18 | require.NotNil(t, r.sq.Tail) 19 | require.NotNil(t, r.sq.Mask) 20 | require.NotNil(t, r.sq.Entries) 21 | require.NotNil(t, r.sq.Flags) 22 | require.NotNil(t, r.sq.Dropped) 23 | require.NotNil(t, r.sq.Entries) 24 | 25 | require.NotZero(t, r.cq.Size) 26 | require.NotNil(t, r.cq.Head) 27 | require.NotNil(t, r.cq.Tail) 28 | require.NotNil(t, r.cq.Mask) 29 | require.NotNil(t, r.cq.Entries) 30 | require.Equal(t, r.fd, r.Fd()) 31 | require.Equal(t, uint64(1), r.ID()) 32 | require.Nil(t, r.FileRegistry()) 33 | require.Equal(t, r.cq, r.CQ()) 34 | require.Equal(t, r.sq, r.SQ()) 35 | 36 | require.NoError(t, r.Stop()) 37 | } 38 | 39 | func TestNewRingInvalidSize(t *testing.T) { 40 | _, err := New(99999, nil) 41 | require.Error(t, err) 42 | } 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Daniel Hodges 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /register_test.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "io/ioutil" 7 | "os" 8 | "syscall" 9 | "testing" 10 | 11 | "github.com/stretchr/testify/require" 12 | ) 13 | 14 | func TestRegisterBuffers(t *testing.T) { 15 | r, err := New(2048, nil) 16 | require.NoError(t, err) 17 | require.NotNil(t, r) 18 | vecs := make([]*syscall.Iovec, 10) 19 | require.NoError(t, RegisterBuffers(r.Fd(), vecs)) 20 | require.NoError(t, UnregisterBuffers(r.Fd(), vecs)) 21 | } 22 | 23 | func TestFileRegistry(t *testing.T) { 24 | r, err := New(2048, nil) 25 | require.NoError(t, err) 26 | require.NotNil(t, r) 27 | 28 | reg := NewFileRegistry(r.Fd()) 29 | f, err := ioutil.TempFile("", "test-file-registry") 30 | require.NoError(t, err) 31 | defer os.Remove(f.Name()) 32 | f2, err := ioutil.TempFile("", "test-file-registry") 33 | require.NoError(t, err) 34 | defer os.Remove(f2.Name()) 35 | 36 | require.NoError(t, reg.Register(int(f.Fd()))) 37 | require.NoError(t, reg.Register(int(f2.Fd()))) 38 | id, ok := reg.ID(int(f2.Fd())) 39 | require.NotZero(t, id) 40 | require.True(t, ok) 41 | require.NoError(t, reg.Unregister(int(f.Fd()))) 42 | } 43 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | 
package iouring 2 | 3 | import ( 4 | "reflect" 5 | "runtime" 6 | "unsafe" 7 | ) 8 | 9 | func saferStringToBytes(s *string) []byte { 10 | bytes := make([]byte, 0, 0) 11 | 12 | // Shameless stolen from: 13 | // See: https://github.com/jlauinger/go-safer 14 | // create the string and slice headers by casting. Obtain pointers to the 15 | // headers to be able to change the slice header properties in the next step 16 | stringHeader := (*reflect.StringHeader)(unsafe.Pointer(s)) 17 | sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&bytes)) 18 | 19 | // set the slice's length and capacity temporarily to zero (this is actually 20 | // unnecessary here because the slice is already initialized as zero, but if 21 | // you are reusing a different slice this is important 22 | sliceHeader.Len = 0 23 | sliceHeader.Cap = 0 24 | 25 | // change the slice header data address 26 | sliceHeader.Data = stringHeader.Data 27 | 28 | // set the slice capacity and length to the string length 29 | sliceHeader.Cap = stringHeader.Len 30 | sliceHeader.Len = stringHeader.Len 31 | 32 | // use the keep alive dummy function to make sure the original string s is not 33 | // freed up until this point 34 | runtime.KeepAlive(s) 35 | 36 | return bytes 37 | } 38 | -------------------------------------------------------------------------------- /completer.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "sync/atomic" 7 | ) 8 | 9 | type completer struct { 10 | cq *CompletionQueue 11 | stopCh chan struct{} 12 | seen chan int 13 | } 14 | 15 | func newCompleter(cq *CompletionQueue, bufSize int) *completer { 16 | return &completer{ 17 | cq: cq, 18 | stopCh: make(chan struct{}, 8), 19 | seen: make(chan int, bufSize), 20 | } 21 | } 22 | 23 | func (c *completer) complete(id int) { 24 | c.seen <- id 25 | } 26 | 27 | func (c *completer) stop() { 28 | c.stopCh <- struct{}{} 29 | } 30 | 31 | func (c *completer) 
run() { 32 | unacked := map[int]struct{}{} 33 | for { 34 | select { 35 | case <-c.stopCh: 36 | return 37 | case id := <-c.seen: 38 | // TODO: is it bad to see twice? 39 | if _, ok := unacked[id]; !ok { 40 | unacked[id] = struct{}{} 41 | } 42 | head := atomic.LoadUint32(c.cq.Head) 43 | mask := atomic.LoadUint32(c.cq.Mask) 44 | seen := int(0) 45 | // Continue to move the head until the next value 46 | // hasn't arrived yet. 47 | curHead := int(head & mask) 48 | for { 49 | _, ok := unacked[curHead+seen] 50 | if !ok { 51 | break 52 | } 53 | delete(unacked, curHead+seen) 54 | seen++ 55 | } 56 | atomic.AddUint32(c.cq.Head, uint32(seen)) 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /examples/cp/stdio.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "io" 6 | "log" 7 | "os" 8 | 9 | "golang.org/x/sys/unix" 10 | ) 11 | 12 | var bufSize int 13 | 14 | func init() { 15 | flag.IntVar(&bufSize, "buf", 4096, "read buffer size") 16 | } 17 | 18 | func main() { 19 | flag.Parse() 20 | args := flag.Args() 21 | if len(args) != 2 { 22 | log.Fatalf("expected src dst: %v", args) 23 | } 24 | 25 | src, err := os.Open(args[0]) 26 | if err != nil { 27 | log.Fatalf("expected src dst: %v", args) 28 | } 29 | 30 | dst, err := os.OpenFile(args[1], os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644) 31 | if err != nil { 32 | log.Fatal(err) 33 | } 34 | 35 | // First get size of src. 36 | //stat, err := src.Stat() 37 | //if err != nil { 38 | // log.Fatal(err) 39 | //} 40 | 41 | // fadvise sequential read to EOF. 
42 | if err := unix.Fadvise(int(src.Fd()), int64(0), int64(0), 3); err != nil { 43 | log.Fatal(err) 44 | } 45 | 46 | buf := make([]byte, bufSize) 47 | for { 48 | n, err := src.Read(buf) 49 | if err != nil { 50 | if err == io.EOF { 51 | break 52 | } 53 | log.Fatal(err) 54 | } 55 | _, err = dst.Write(buf[:n]) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | } 60 | 61 | if err := src.Close(); err != nil { 62 | log.Fatal(err) 63 | } 64 | 65 | if err := dst.Close(); err != nil { 66 | log.Fatal(err) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /examples/cp/iouring.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "io" 6 | "log" 7 | "os" 8 | 9 | "github.com/hodgesds/iouring-go" 10 | ) 11 | 12 | var bufSize int 13 | 14 | func init() { 15 | flag.IntVar(&bufSize, "buf", 4096, "read buffer size") 16 | } 17 | 18 | func main() { 19 | flag.Parse() 20 | ring, err := iouring.New(4096, &iouring.Params{ 21 | Features: iouring.FeatNoDrop, 22 | }) 23 | if err != nil { 24 | log.Fatal(err) 25 | } 26 | 27 | args := flag.Args() 28 | if len(args) != 2 { 29 | log.Fatal("expected src dst") 30 | } 31 | 32 | src, err := os.Open(args[0]) 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | 37 | dst, err := os.OpenFile(args[1], os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644) 38 | if err != nil { 39 | log.Fatal(err) 40 | } 41 | 42 | r, err := ring.FileReadWriter(src) 43 | if err != nil { 44 | log.Fatal(err) 45 | } 46 | 47 | w, err := ring.FileReadWriter(dst) 48 | if err != nil { 49 | log.Fatal(err) 50 | } 51 | 52 | buf := make([]byte, bufSize) 53 | 54 | for { 55 | n, err := r.Read(buf) 56 | if err != nil { 57 | if err == io.EOF { 58 | break 59 | } 60 | log.Fatal(err) 61 | } 62 | if n == 0 { 63 | break 64 | } 65 | _, err = w.Write(buf[:n]) 66 | if err != nil { 67 | log.Fatal(err) 68 | } 69 | } 70 | 71 | if err := r.Close(); err != nil { 72 | log.Fatal(err) 73 | } 
74 | 75 | if err := w.Close(); err != nil { 76 | log.Fatal(err) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 2 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 4 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= 8 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 9 | github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= 10 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 11 | golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= 12 | golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 13 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 14 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 15 | gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= 16 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 17 | -------------------------------------------------------------------------------- /read_writer_test.go: -------------------------------------------------------------------------------- 1 | // build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | 
"io/ioutil" 7 | "os" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestReadWriterReadAt(t *testing.T) { 14 | r, err := New(1024, &Params{ 15 | Features: FeatNoDrop, 16 | }) 17 | require.NoError(t, err) 18 | require.NotNil(t, r) 19 | 20 | content := []byte("testing...1,2.3") 21 | f, err := ioutil.TempFile("", "example") 22 | require.NoError(t, err) 23 | defer os.Remove(f.Name()) 24 | 25 | _, err = f.Write(content) 26 | require.NoError(t, err) 27 | require.NoError(t, f.Sync()) 28 | 29 | rw, err := r.FileReadWriter(f) 30 | require.NoError(t, err) 31 | 32 | buf := make([]byte, len(content)/2) 33 | n, err := rw.ReadAt(buf, 0) 34 | require.True(t, 35 | n == len(buf), 36 | "Excpected length %d, got: %d", 37 | n, 38 | len(buf), 39 | ) 40 | } 41 | 42 | func TestReadWriterWriteAt(t *testing.T) { 43 | r, err := New(1024, &Params{ 44 | Features: FeatNoDrop, 45 | }) 46 | require.NoError(t, err) 47 | require.NotNil(t, r) 48 | 49 | content := []byte("testing...1,2.3") 50 | f, err := ioutil.TempFile("", "example") 51 | require.NoError(t, err) 52 | defer os.Remove(f.Name()) 53 | 54 | rw, err := r.FileReadWriter(f) 55 | require.NoError(t, err) 56 | 57 | n, err := rw.WriteAt(content, 0) 58 | require.True(t, 59 | n == len(content), 60 | "Excpected length %d, got: %d", 61 | n, 62 | len(content), 63 | ) 64 | } 65 | -------------------------------------------------------------------------------- /submitter.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import "time" 6 | 7 | type submitter interface { 8 | submit(uint64) 9 | stop() 10 | } 11 | 12 | type ringSubmitter struct { 13 | r *Ring 14 | done chan struct{} 15 | work chan struct{} 16 | deadline time.Duration 17 | } 18 | 19 | func newRingSubmitter(r *Ring, deadline time.Duration) *ringSubmitter { 20 | return &ringSubmitter{ 21 | r: r, 22 | done: make(chan struct{}), 23 | work: make(chan struct{}, 128), 24 | 
deadline: deadline, 25 | } 26 | } 27 | 28 | func (s *ringSubmitter) submit(reqID uint64) { 29 | // We don't actually care about the request id. 30 | s.work <- struct{}{} 31 | } 32 | 33 | func (s *ringSubmitter) run() { 34 | timer := time.NewTimer(s.deadline) 35 | if !timer.Stop() { 36 | <-timer.C 37 | } 38 | count := 0 39 | seen := 0 40 | timerActive := false 41 | for { 42 | select { 43 | case <-timer.C: 44 | enter: 45 | n, err := s.r.Enter(uint(count), uint(0), EnterGetEvents, nil) 46 | if err != nil { 47 | continue 48 | } 49 | seen += n 50 | if seen < count { 51 | goto enter 52 | } 53 | seen = 0 54 | count = 0 55 | timerActive = false 56 | 57 | case <-s.work: 58 | if !timerActive { 59 | timerActive = true 60 | timer.Reset(s.deadline) 61 | } 62 | count++ 63 | 64 | case <-s.done: 65 | if !timer.Stop() { 66 | <-timer.C 67 | } 68 | return 69 | } 70 | } 71 | } 72 | 73 | func (s *ringSubmitter) stop() { 74 | s.done <- struct{}{} 75 | } 76 | -------------------------------------------------------------------------------- /examples/net/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "net/http" 8 | 9 | "github.com/hodgesds/iouring-go" 10 | ) 11 | 12 | var ( 13 | port int 14 | debug bool 15 | ) 16 | 17 | func init() { 18 | flag.IntVar(&port, "port", 9999, "HTTP port") 19 | flag.BoolVar(&debug, "debug", false, "debug mode") 20 | } 21 | 22 | func main() { 23 | flag.Parse() 24 | ops := []iouring.RingOption{ 25 | iouring.WithEnterErrHandler(func(err error) { 26 | log.Println(err) 27 | }), 28 | } 29 | if debug { 30 | ops = append(ops, iouring.WithDebug()) 31 | } 32 | r, err := iouring.New( 33 | 8192, 34 | &iouring.Params{ 35 | Features: iouring.FeatNoDrop, 36 | }, 37 | ops..., 38 | ) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | fmt.Printf("listening on port: %d\n", port) 44 | l, err := r.SockoptListener( 45 | "tcp", 46 | fmt.Sprintf(":%d", port), 47 | 
func(err error) { 48 | log.Println(err) 49 | }, 50 | iouring.SOReuseport, 51 | ) 52 | if err != nil { 53 | log.Fatal(err) 54 | } 55 | defer l.Close() 56 | 57 | mux := http.NewServeMux() 58 | mux.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) { 59 | // The "/" pattern matches everything, so we need to check 60 | // that we're at the root here. 61 | if req.URL.Path != "/" { 62 | http.NotFound(w, req) 63 | return 64 | } 65 | fmt.Fprintf(w, "hello io_uring!\n") 66 | }) 67 | 68 | s := http.Server{Handler: mux} 69 | if err := s.Serve(l); err != nil { 70 | log.Fatal(err) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /ring_option.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "time" 7 | 8 | "golang.org/x/sys/unix" 9 | ) 10 | 11 | // RingOption is an option for configuring a Ring. 12 | type RingOption func(*Ring) error 13 | 14 | // WithDebug is used to print additional debug information. 15 | func WithDebug() RingOption { 16 | return func(r *Ring) error { 17 | r.debug = true 18 | return nil 19 | } 20 | } 21 | 22 | // WithEventFd is used to create an eventfd and register it to the Ring. 23 | // The event fd can be accessed using the EventFd method. 24 | func WithEventFd(initval uint, flags int, async bool) RingOption { 25 | return func(r *Ring) error { 26 | fd, err := unix.Eventfd(initval, flags) 27 | if err != nil { 28 | return err 29 | } 30 | r.eventFd = fd 31 | if async { 32 | return RegisterEventFdAsync(r.fd, fd) 33 | } 34 | return RegisterEventFd(r.fd, fd) 35 | } 36 | } 37 | 38 | // WithFileRegistry is used to register a FileRegistry with the Ring. The 39 | // registery can be accessed with the FileRegistry method on the ring. 
40 | func WithFileRegistry() RingOption { 41 | return func(r *Ring) error { 42 | r.fileReg = NewFileRegistry(r.fd) 43 | return nil 44 | } 45 | } 46 | 47 | // WithID is used to set the starting id for the monotonically increasing ID 48 | // method. 49 | func WithID(id uint64) RingOption { 50 | return func(r *Ring) error { 51 | r.idx = &id 52 | return nil 53 | } 54 | } 55 | 56 | // WithEnterErrHandler is used to handle errors on ring enter. 57 | func WithEnterErrHandler(f func(error)) RingOption { 58 | return func(r *Ring) error { 59 | r.enterErrHandler = f 60 | return nil 61 | } 62 | } 63 | 64 | // WithDeadline is used to configure the deadline for submitting IO. 65 | func WithDeadline(d time.Duration) RingOption { 66 | return func(r *Ring) error { 67 | r.deadline = d 68 | s := newRingSubmitter(r, d) 69 | // This is an ugly hack.... 70 | go s.run() 71 | r.submitter = s 72 | return nil 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/bulk-socket/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "net" 9 | "net/http" 10 | "time" 11 | "unsafe" 12 | 13 | gsse "github.com/gin-contrib/sse" 14 | "github.com/gin-gonic/gin" 15 | "github.com/hodgesds/iouring-go" 16 | "github.com/r3labs/sse" 17 | ) 18 | 19 | var ( 20 | fds = make([]int32, 0) 21 | message []byte 22 | ring *iouring.Ring 23 | ) 24 | 25 | func main() { 26 | msg := struct { 27 | ID int 28 | Author string 29 | Content string 30 | }{ 31 | 112, 32 | "Sample User", 33 | "Sample message 123", 34 | } 35 | 36 | // Create a static message to deliver that can also be compared against when we receive it over the network 37 | message, _ = json.Marshal(msg) 38 | 39 | ring, _ = iouring.New(1024, &iouring.Params{}) 40 | 41 | // Start a new go routine that sends a message every second 42 | go sendMessage() 43 | 44 | r := gin.Default() 45 | 46 
| // Create a SSE endpoint that hijacks all incoming connections and adds their underlying file descriptors to an array 47 | r.GET("/listen", func(c *gin.Context) { 48 | c.Header("Content-Type", "text/event-stream") 49 | c.Writer.WriteHeaderNow() 50 | 51 | nc, _, _ := c.Writer.Hijack() 52 | 53 | sf, _ := nc.(*net.TCPConn).File() 54 | 55 | fds = append(fds, int32(sf.Fd())) 56 | }) 57 | 58 | l, err := net.Listen("tcp", ":0") 59 | if err != nil { 60 | panic(err) 61 | } 62 | 63 | addr := fmt.Sprintf("http://localhost:%d/listen", l.Addr().(*net.TCPAddr).Port) 64 | 65 | go http.Serve(l, r) 66 | 67 | // Spawn n many clients to establish an SSE 68 | for i := 0; i < 10; i++ { 69 | time.Sleep(100 * time.Millisecond) 70 | go spawnClient(addr) 71 | } 72 | 73 | select {} 74 | } 75 | 76 | type backOff struct{} 77 | 78 | func (b *backOff) NextBackOff() time.Duration { return -1 } 79 | func (b *backOff) Reset() {} 80 | 81 | func spawnClient(addr string) { 82 | c := sse.NewClient(addr) 83 | c.ReconnectStrategy = &backOff{} 84 | 85 | // Subscribe to the SSE endpoint 86 | if err := c.Subscribe("", func(evt *sse.Event) { 87 | // If we receive an event that isn't equal to our preset message, it has been corrupted 88 | if string(message) != string(evt.Data) { 89 | log.Fatalf("Client received invalid response, expected: %s but got %s", string(message), string(evt.Data)) 90 | } 91 | }); err != nil { 92 | log.Fatalln(err.Error()) 93 | } 94 | } 95 | 96 | func sendMessage() { 97 | for { 98 | time.Sleep(1 * time.Second) 99 | 100 | if err := send(fds, message); err != nil { 101 | log.Fatal(err.Error()) 102 | } 103 | } 104 | } 105 | 106 | func send(fds []int32, data []byte) error { 107 | fmt.Printf("Sending %d bytes to %d sockets\n", len(data), len(fds)) 108 | 109 | var b bytes.Buffer 110 | // Encode the JSON message into an SSE 111 | if err := gsse.Encode(&b, gsse.Event{ 112 | Event: "message", 113 | Data: json.RawMessage(data), 114 | }); err != nil { 115 | return err 116 | } 117 | 118 | sdata 
:= b.Bytes() 119 | 120 | wire := bytes.Buffer{} 121 | 122 | // Wrap the SSE into the chunked http wire format 123 | fmt.Fprintf(&wire, "%x\r\n", len(sdata)) 124 | wire.Write(sdata) 125 | wire.WriteString("\r\n") 126 | 127 | rawData := wire.Bytes() 128 | 129 | addr := (uint64)(uintptr(unsafe.Pointer(&rawData[0]))) 130 | length := uint32(len(rawData)) 131 | 132 | // Queue up n many SQE's for each file descriptor 133 | for _, fd := range fds { 134 | e, commit := ring.SubmitEntry() 135 | 136 | e.Opcode = iouring.WriteFixed 137 | e.Fd = fd 138 | e.Addr = addr 139 | e.Len = length 140 | 141 | commit() 142 | } 143 | 144 | return ring.Enter(uint(len(fds)), uint(len(fds)), iouring.EnterGetEvents, nil) 145 | } 146 | -------------------------------------------------------------------------------- /consts.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import "io" 6 | 7 | // See uapi/linux/io_uring.h 8 | 9 | // Opcode is an opcode for the ring. 10 | type Opcode uint8 11 | 12 | const ( 13 | // SetupSyscall defines the syscall number for io_uring_setup. 14 | SetupSyscall = 425 15 | // EnterSyscall defines the syscall number for io_uring_enter. 16 | EnterSyscall = 426 17 | // RegisterSyscall defines the syscall number for io_uring_register. 18 | RegisterSyscall = 427 19 | ) 20 | 21 | const ( 22 | 23 | // FeatSingleMmap is used to configure a single mmap'd ring. 24 | FeatSingleMmap = (1 << 0) 25 | // FeatNoDrop is used to ensure that no CQEs are dropped. 
	FeatNoDrop = (1 << 1)
	// FeatSubmitStable indicates SQE data is stable once submitted
	// (IORING_FEAT_SUBMIT_STABLE in the kernel header).
	FeatSubmitStable = (1 << 2)
	// FeatRwCurPos indicates reads/writes may use the current file position
	// (IORING_FEAT_RW_CUR_POS).
	FeatRwCurPos = (1 << 3)
	// FeatCurPersonality indicates the submitting task's credentials are
	// used for the request (IORING_FEAT_CUR_PERSONALITY).
	FeatCurPersonality = (1 << 4)
)

const (
	/*
	 * sqe->flags
	 */
	// Bit positions for the SqeXxx flag values declared below.
	SqeFixedFileBit = iota
	SqeIoDrainBit
	SqeIoLinkBit
	SqeIoHardlinkBit
	SqeAsyncBit
	SqeBufferSelectBit

	// SqeFixedFile use fixed fileset
	SqeFixedFile uint8 = (1 << SqeFixedFileBit)
	// SqeIoDrain issue after inflight IO
	SqeIoDrain uint8 = (1 << SqeIoDrainBit)
	// SqeIoLink is used to link multiple SQEs.
	SqeIoLink uint8 = (1 << SqeIoLinkBit)
	// SqeIoHardlink is a hard link to multiple SQEs
	SqeIoHardlink uint8 = (1 << SqeIoHardlinkBit)
	// SqeAsync is use to specify async io.
	SqeAsync uint8 = (1 << SqeAsyncBit)
	// SqeBufferSelect is used to specify buffer select.
	SqeBufferSelect uint8 = (1 << SqeBufferSelectBit)

	/*
	 * io_uring_setup() flags
	 */

	// SetupIOPoll io_context is polled
	SetupIOPoll uint32 = (1 << 0)
	// SetupSQPoll SQ poll thread
	SetupSQPoll uint32 = (1 << 1)
	// SetupSQAFF sq_thread_cpu is valid
	SetupSQAFF uint32 = (1 << 2)
	// SetupCqSize app defines CQ size
	SetupCqSize uint32 = (1 << 3)
	// SetupClamp clamp SQ/CQ ring sizes
	SetupClamp uint32 = (1 << 4)
	// SetupAttachWq attach to existing wq
	SetupAttachWq uint32 = (1 << 5)
)

// Ring opcodes in kernel ABI order; the iota sequence must not be reordered.
const (
	Nop Opcode = iota
	Readv
	Writev
	Fsync
	ReadFixed
	WriteFixed
	PollAdd
	PollRemove
	SyncFileRange
	SendMsg
	RecvMsg
	Timeout
	TimeoutRemove
	Accept
	AsyncCancel
	LinkTimeout
	Connect
	Fallocate
	OpenAt
	Close
	FilesUpdate
	Statx
	Read
	Write
	Fadvise
	Madvise
	Send
	Recv
	Openat2
	EpollCtl
	Splice
	ProvideBuffers
	RemoveBuffers
	// OpSupported marks an opcode as supported in a probe result.
	OpSupported = (1 << 0)
)
const (
	/*
	 * sqe->fsync_flags
	 */

	// FsyncDatasync requests fdatasync-like behavior (IORING_FSYNC_DATASYNC).
	FsyncDatasync uint = (1 << 0)

	/*
	 * Magic offsets for the application to mmap the data it needs
	 */

	// SqRingOffset is the offset of the submission queue.
	SqRingOffset uint64 = 0
	// CqRingOffset is the offset of the completion queue.
	CqRingOffset uint64 = 0x8000000
	// SqeRingOffset is the offset of the submission queue entries.
	SqeRingOffset uint64 = 0x10000000

	/*
	 * sq_ring->flags
	 */

	// SqNeedWakeup needs io_uring_enter wakeup
	SqNeedWakeup uint32 = (1 << 0)
	// SqCqOverflow indicates the CQ ring overflowed (IORING_SQ_CQ_OVERFLOW).
	SqCqOverflow uint32 = (1 << 1)

	/*
	 * io_uring_enter(2) flags
	 */

	// EnterGetEvents waits for completions before returning.
	EnterGetEvents uint = (1 << 0)
	// EnterSqWakeup wakes a sleeping SQPOLL thread.
	EnterSqWakeup uint = (1 << 1)

	/*
	 * io_uring_register(2) opcodes and arguments
	 */

	// Opcodes for io_uring_register(2); values mirror the kernel ABI.
	RegRegisterBuffers = 0
	RegUnregisterBuffers = 1
	RegRegisterFiles = 2
	RegUnregisterFiles = 3
	RegRegisterEventFd = 4
	RegUnregisterEventfd = 5
	RegRegisterFilesUpdate = 6
	RegRegisterEventFdAsync = 7
	RegRegisterProbe = 8
	RegRegisterPersonality = 9
	RegUnregisterPersonality = 10
)

// ReadWriteSeekerCloser is a ReadWriteCloser and ReadWriteSeeker.
type ReadWriteSeekerCloser interface {
	io.Reader
	io.Writer
	io.Seeker
	io.Closer
	ReadAt([]byte, int64) (int, error)
	WriteAt([]byte, int64) (int, error)
}
-------------------------------------------------------------------------------- /ring_conn.go: --------------------------------------------------------------------------------
// +build linux

package iouring

import (
	"context"
	"net"
	"runtime"
	"sync"
	"syscall"
	"time"
	"unsafe"

	"github.com/pkg/errors"
)

// ringConn is a net.Conn that is backed by the Ring.
18 | type ringConn struct { 19 | fd int 20 | laddr *addr 21 | raddr *addr 22 | r *Ring 23 | offset *int64 24 | stop chan struct{} 25 | poll chan uint64 26 | pollReady *int32 27 | 28 | deadMu sync.RWMutex 29 | deadline time.Time 30 | readDeadline time.Time 31 | writeDeadline time.Time 32 | } 33 | 34 | // getCqe is used for getting a CQE result. 35 | func (c *ringConn) getCqe(ctx context.Context, reqID uint64) (int, error) { 36 | // TODO: Where should this repoll go? 37 | _, err := c.r.Enter(uint(1024), uint(1), EnterGetEvents, nil) 38 | if err != nil { 39 | return 0, err 40 | } 41 | c.stop <- struct{}{} 42 | var cqe *CompletionEntry 43 | for { 44 | select { 45 | case <-ctx.Done(): 46 | return 0, syscall.ETIMEDOUT 47 | default: 48 | } 49 | cqe, err = c.r.cq.EntryBy(reqID) 50 | if err != nil { 51 | // TODO: How many tries should looking for the cqe be 52 | // tried? 53 | if err != ErrEntryNotFound { 54 | continue 55 | } 56 | return 0, err 57 | } 58 | break 59 | } 60 | res := int(cqe.Res) 61 | if res < 0 { 62 | return 0, syscall.Errno(-res) 63 | } 64 | 65 | return res, nil 66 | } 67 | 68 | func (c *ringConn) rePoll() { 69 | // Reenable the poll on the connection. 70 | id := c.r.ID() 71 | sqe, commit := c.r.SubmitEntry() 72 | sqe.Opcode = PollAdd 73 | sqe.Fd = int32(c.fd) 74 | sqe.UFlags = int32(POLLIN) 75 | sqe.UserData = id 76 | commit() 77 | c.r.Enter(uint(1024), uint(1), EnterGetEvents, nil) 78 | } 79 | 80 | func (c *ringConn) run() { 81 | for { 82 | select { 83 | case <-c.stop: 84 | id := c.r.ID() 85 | sqe, commit := c.r.SubmitEntry() 86 | sqe.Opcode = PollRemove 87 | sqe.Fd = int32(c.fd) 88 | sqe.UserData = id 89 | commit() 90 | c.getCqe(context.Background(), id) 91 | return 92 | } 93 | } 94 | } 95 | 96 | // Read implements the net.Conn interface. 
func (c *ringConn) Read(b []byte) (int, error) {
	// Re-arm the POLLIN poll before issuing the read.
	c.rePoll()
	sqe, commit := c.r.SubmitEntry()
	if sqe == nil {
		return 0, errors.New("ring unavailable")
	}

	// NOTE(review): ReadFixed normally requires a buffer registered via
	// io_uring_register — confirm Readv/Read isn't intended here.
	sqe.Opcode = ReadFixed
	sqe.Fd = int32(c.fd)
	sqe.Len = uint32(len(b))
	sqe.Flags = 0
	sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0])))
	// Use reqId as user data so we can return the request from the
	// completion queue.
	reqID := c.r.ID()
	sqe.UserData = reqID
	commit()
	ctx := context.Background()

	n, err := c.getCqe(ctx, reqID)
	// Keep b reachable until the kernel has completed the request.
	runtime.KeepAlive(b)
	return n, err
}

// Write implements the net.Conn interface.
func (c *ringConn) Write(b []byte) (n int, err error) {
	sqe, commit := c.r.SubmitEntry()
	if sqe == nil {
		return 0, errors.New("ring unavailable")
	}

	// NOTE(review): WriteFixed — same registered-buffer caveat as Read.
	sqe.Opcode = WriteFixed
	sqe.Fd = int32(c.fd)
	sqe.Len = uint32(len(b))
	sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0])))
	// Use reqId as user data so we can return the request from the
	// completion queue.
	reqID := c.r.ID()
	sqe.UserData = reqID
	commit()

	n, err = c.getCqe(context.Background(), reqID)
	runtime.KeepAlive(b)
	return n, err
}

// Close implements the net.Conn interface.
func (c *ringConn) Close() error {
	// Wake run() so it removes the outstanding poll before the fd closes.
	c.stop <- struct{}{}
	return syscall.Close(c.fd)
}

// LocalAddr implements the net.Conn interface.
func (c *ringConn) LocalAddr() net.Addr {
	return c.laddr
}

// RemoteAddr implements the net.Conn interface.
func (c *ringConn) RemoteAddr() net.Addr {
	return c.raddr
}

// SetDeadline implements the net.Conn interface.
func (c *ringConn) SetDeadline(t time.Time) error {
	// NOTE(review): the deadline is recorded but not visibly enforced by
	// Read/Write in this file — confirm enforcement elsewhere.
	c.deadMu.Lock()
	c.deadline = t
	c.deadMu.Unlock()
	return nil
}

// SetReadDeadline implements the net.Conn interface.
func (c *ringConn) SetReadDeadline(t time.Time) error {
	c.deadMu.Lock()
	c.readDeadline = t
	c.deadMu.Unlock()
	return nil
}

// SetWriteDeadline the net.Conn interface.
func (c *ringConn) SetWriteDeadline(t time.Time) error {
	c.deadMu.Lock()
	c.writeDeadline = t
	c.deadMu.Unlock()
	return nil
}
-------------------------------------------------------------------------------- /uring_setup.go: --------------------------------------------------------------------------------
// +build linux

package iouring

import (
	"reflect"
	"runtime"
	"syscall"
	"unsafe"

	"github.com/pkg/errors"
)

// Element sizes used to compute the mmap lengths for the rings.
var (
	uint32Size = unsafe.Sizeof(uint32(0))
	cqeSize = unsafe.Sizeof(CompletionEntry{})
	sqeSize = unsafe.Sizeof(SubmitEntry{})
)

// Setup is used to setup a io_uring using the io_uring_setup syscall.
// It returns the ring fd on success; params is filled in by the kernel.
func Setup(entries uint, params *Params) (int, error) {
	fd, _, errno := syscall.Syscall(
		SetupSyscall,
		uintptr(entries),
		uintptr(unsafe.Pointer(params)),
		uintptr(0),
	)
	if errno != 0 {
		err := errno
		return 0, err
	}
	return int(fd), nil
}

// MmapRing is used to configure the submit and completion queues, it should only
// be called after the Setup function has completed successfully.
// See:
// https://github.com/axboe/liburing/blob/master/src/setup.c#L22
func MmapRing(fd int, p *Params, sq *SubmitQueue, cq *CompletionQueue) error {
	var (
		cqPtr uintptr
		sqPtr uintptr
		errno syscall.Errno
		err error
	)
	// NOTE(review): FeatSingleMmap is a feature bit the kernel reports in
	// params.Features; liburing tests p->features here, so checking p.Flags
	// looks like a bug (this actually tests SetupIOPoll) — confirm.
	singleMmap := p.Flags&FeatSingleMmap != 0
	sq.Size = uint32(uint(p.SqOffset.Array) + (uint(p.SqEntries) * uint(uint32Size)))
	cq.Size = uint32(uint(p.CqOffset.Cqes) + (uint(p.CqEntries) * uint(cqeSize)))

	// With a single mmap both rings must share the larger of the two sizes.
	if singleMmap {
		if cq.Size > sq.Size {
			sq.Size = cq.Size
		} else {
			cq.Size = sq.Size
		}
	}

	// Map the SQ ring metadata region.
	sqPtr, _, errno = syscall.Syscall6(
		syscall.SYS_MMAP,
		uintptr(0),
		uintptr(sq.Size),
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_SHARED|syscall.MAP_POPULATE,
		uintptr(fd),
		uintptr(SqRingOffset),
	)
	if errno != 0 {
		err = errno
		return errors.Wrap(err, "failed to mmap sq ring")
	}
	sq.ptr = sqPtr

	// Conversion of a uintptr back to Pointer is not valid in general,
	// except for:
	// 3) Conversion of a Pointer to a uintptr and back, with arithmetic.

	// Go vet doesn't like this so it's probably not valid.
	sq.Head = (*uint32)(unsafe.Pointer(sq.ptr + uintptr(p.SqOffset.Head)))
	sq.Tail = (*uint32)(unsafe.Pointer(sq.ptr + uintptr(p.SqOffset.Tail)))
	sq.Mask = (*uint32)(unsafe.Pointer(sq.ptr + uintptr(p.SqOffset.RingMask)))
	sq.Flags = (*uint32)(unsafe.Pointer(sq.ptr + uintptr(p.SqOffset.Flags)))
	sq.Dropped = (*uint32)(unsafe.Pointer(sq.ptr + uintptr(p.SqOffset.Dropped)))

	// Map the sqe ring.
	sqePtr, _, errno := syscall.Syscall6(
		syscall.SYS_MMAP,
		uintptr(0),
		uintptr(uint(p.SqEntries)*uint(sqeSize)),
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_SHARED|syscall.MAP_POPULATE,
		uintptr(fd),
		uintptr(SqeRingOffset),
	)
	// NOTE(review): syscall.Errno is unsigned, so errno < 0 is never true
	// and a failed mmap is silently ignored here — this should be
	// errno != 0 (and return errno directly, not -errno).
	if errno < 0 {
		return syscall.Errno(-errno)
	}

	// Making mmap'd slices is annoying.
	// BUG: don't use composite literals
	sq.Entries = *(*[]SubmitEntry)(unsafe.Pointer(&reflect.SliceHeader{
		Data: uintptr(sqePtr),
		Len: int(p.SqEntries),
		Cap: int(p.SqEntries),
	}))
	// BUG: don't use composite literals
	sq.Array = *(*[]uint32)(unsafe.Pointer(&reflect.SliceHeader{
		Data: uintptr(unsafe.Pointer(sqPtr + uintptr(p.SqOffset.Array))),
		Len: int(p.SqEntries),
		Cap: int(p.SqEntries),
	}))
	runtime.KeepAlive(sqePtr)

	// With FeatSingleMmap the CQ shares the SQ mapping; otherwise map it.
	if singleMmap {
		cqPtr = sqPtr
	} else {
		cqPtr, _, errno = syscall.Syscall6(
			syscall.SYS_MMAP,
			uintptr(0),
			uintptr(cq.Size),
			syscall.PROT_READ|syscall.PROT_WRITE,
			syscall.MAP_SHARED|syscall.MAP_POPULATE,
			uintptr(fd),
			uintptr(CqRingOffset),
		)
		// NOTE(review): same never-true errno < 0 check as above.
		if errno < 0 {
			return syscall.Errno(-errno)
		}
	}

	cq.Head = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.CqOffset.Head))))
	cq.Tail = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.CqOffset.Tail))))
	cq.Mask = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.CqOffset.RingMask))))
	cq.Overflow = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.CqOffset.Overflow))))
	cq.Flags = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.CqOffset.Flags))))

	// BUG: don't use composite literals
	cq.Entries = *(*[]CompletionEntry)(unsafe.Pointer(&reflect.SliceHeader{
		Data: uintptr(uint(cqPtr) + uint(p.CqOffset.Cqes)),
		Len: int(p.CqEntries),
		Cap: int(p.CqEntries),
	}))
	// See:
https://github.com/jlauinger/go-safer
	runtime.KeepAlive(cqPtr)

	return nil
}
-------------------------------------------------------------------------------- /ring_benchmark_test.go: --------------------------------------------------------------------------------
// +build linux

package iouring

import (
	"crypto/rand"
	"fmt"
	"io/ioutil"
	"os"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

// BenchmarkRingWrite measures ring-backed file writes across several
// ring-size/write-size combinations.
func BenchmarkRingWrite(b *testing.B) {
	tests := []struct {
		ringSize uint
		writeSize int
	}{
		{
			ringSize: 1024,
			writeSize: 128,
		},
		{
			ringSize: 1024,
			writeSize: 512,
		},
		{
			ringSize: 1024,
			writeSize: 1024,
		},
		{
			ringSize: 8192,
			writeSize: 2048,
		},
		{
			ringSize: 8192,
			writeSize: 4096,
		},
	}

	for _, test := range tests {
		b.Run(
			fmt.Sprintf("ring-%d-write-%d", test.ringSize, test.writeSize),
			func(b *testing.B) {
				r, err := New(test.ringSize, &Params{
					Features: FeatNoDrop,
				},
				)
				require.NoError(b, err)
				require.NotNil(b, r)

				//bufPool := sync.Pool{
				//	New: func() interface{} {
				//		return make([]byte, writeSize)
				//	},
				//}

				f, err := ioutil.TempFile("", "example")
				require.NoError(b, err)
				defer os.Remove(f.Name())

				rw, err := r.FileReadWriter(f)
				require.NoError(b, err)

				data := make([]byte, test.writeSize)

				b.SetBytes(int64(test.writeSize))
				b.ReportAllocs()
				b.ResetTimer()
				for i := 0; i < b.N; i++ {
					//data := bufPool.Get().([]byte)
					_, err = rw.Write(data)
					if err != nil {
						b.Fatal(err)
					}
					//bufPool.Put(data)
				}
			},
		)
	}
}

// BenchmarkFileWrite measures plain os.File writes for comparison with the
// ring-backed writer above.
func BenchmarkFileWrite(b *testing.B) {
	tests := []struct {
		ringSize uint
		writeSize int
		multiwrite int
	}{
		{
			ringSize: 1024,
			writeSize: 128,
			multiwrite: 1,
		},
		{
			ringSize: 1024,
			writeSize: 512,
			multiwrite: 1,
		},
		{
			ringSize: 1024,
			writeSize: 1024,
			multiwrite: 1,
		},
		{
			ringSize: 8192,
			writeSize: 2048,
			multiwrite: 2,
		},
		{
			ringSize: 8192,
			writeSize: 4096,
			multiwrite: 2,
		},
	}
	for _, test := range tests {
		b.Run(
			fmt.Sprintf("os-file-write-%d", test.writeSize),
			func(b *testing.B) {
				data := make([]byte, test.writeSize)
				n, err := rand.Read(data)
				require.NoError(b, err)
				require.Equal(b, test.writeSize, int(n))

				f, err := os.OpenFile(
					fmt.Sprintf("os-file-write-%d.test", test.writeSize),
					os.O_RDWR|os.O_CREATE, 0644)
				require.NoError(b, err)
				defer os.Remove(f.Name())

				b.SetBytes(int64(len(data)))
				b.ReportAllocs()
				b.ResetTimer()
				// NOTE(review): multiwrite (and ringSize) are never used by
				// this loop — confirm whether batched writes were intended.
				for i := 0; i < b.N; i++ {
					f.Write(data)
				}
			},
		)
	}
}

// BenchmarkRingDeadlineWrite measures ring writes with a submit deadline.
func BenchmarkRingDeadlineWrite(b *testing.B) {
	tests := []struct {
		ringSize uint
		writeSize int
		deadline time.Duration
	}{
		{
			ringSize: 1024,
			writeSize: 128,
			deadline: 1 * time.Millisecond,
		},
		{
			ringSize: 1024,
			writeSize: 512,
			deadline: 100 * time.Microsecond,
		},
		{
			ringSize: 1024,
			writeSize: 1024,
			deadline: 10 * time.Microsecond,
		},
		{
			ringSize: 8192,
			writeSize: 2048,
			deadline: 1 * time.Microsecond,
		},
		{
			ringSize: 8192,
			writeSize: 4096,
			deadline: 1 * time.Microsecond,
		},
	}
	for _, test := range tests {
		b.Run(
			fmt.Sprintf("ring-%d-deadline-%v-%d", test.ringSize, test.deadline.String(), test.writeSize),
			func(b *testing.B) {
				r, err := New(test.ringSize, &Params{Features: FeatNoDrop}, WithDeadline(test.deadline))
| require.NoError(b, err) 181 | require.NotNil(b, r) 182 | 183 | f, err := ioutil.TempFile("", "example") 184 | require.NoError(b, err) 185 | defer os.Remove(f.Name()) 186 | 187 | rw, err := r.FileReadWriter(f) 188 | require.NoError(b, err) 189 | 190 | data := make([]byte, test.writeSize) 191 | 192 | b.SetBytes(int64(test.writeSize)) 193 | b.ReportAllocs() 194 | b.ResetTimer() 195 | for i := 0; i < b.N; i++ { 196 | _, err = rw.Write(data) 197 | if err != nil { 198 | b.Fatal(err) 199 | } 200 | } 201 | }, 202 | ) 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /examples/bulk-socket/go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= 4 | github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= 5 | github.com/gin-gonic/gin v1.6.3 h1:ahKqKTFpO5KTPHxWZjEdPScmYaGtLo8Y4DMHoEsnp14= 6 | github.com/gin-gonic/gin v1.6.3/go.mod h1:75u5sXoLsGZoRN5Sgbi1eraJ4GU3++wFwWzhwvtwp4M= 7 | github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= 8 | github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= 9 | github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= 10 | github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= 11 | github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= 12 | github.com/go-playground/validator/v10 v10.2.0 h1:KgJ0snyC2R9VXYN2rneOtQcw5aHQB1Vv0sFl1UcHBOY= 13 | github.com/go-playground/validator/v10 v10.2.0/go.mod h1:uOYAAleCW8F/7oMFd6aG0GOhaH6EGOAJShg8Id5JGkI= 14 | 
github.com/golang/protobuf v1.3.3 h1:gyjaxf+svBWX08ZjK86iN9geUJF0H6gp2IRKX6Nf6/I= 15 | github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= 16 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 17 | github.com/hodgesds/iouring-go v0.0.0-20200506041732-4ec64dcb5875 h1:QSVBRVqTQRNTVnNxffYcJy5MD7YD89SANqHQD5RP+QQ= 18 | github.com/hodgesds/iouring-go v0.0.0-20200506041732-4ec64dcb5875/go.mod h1:HbMokKokhmAlGWaSJRUCUhhTudi+e0ZFvZq9b6JYHRo= 19 | github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= 20 | github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= 21 | github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= 22 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 23 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 24 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 25 | github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= 26 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 27 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 28 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 29 | github.com/r3labs/sse v0.0.0-20200310095403-ee05428e4d0e h1:w3ZemLxSM2hb3bHk7wjNaAAluaDQ+9WnWZQV1wcA8f4= 30 | github.com/r3labs/sse v0.0.0-20200310095403-ee05428e4d0e/go.mod h1:S8xSOnV3CgpNrWd0GQ/OoQfMtlg2uPRSuTzcSGrzwK8= 31 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 32 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 33 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 
34 | github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo= 35 | github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= 36 | github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= 37 | github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= 38 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 39 | golang.org/x/net v0.0.0-20191116160921-f9c825593386 h1:ktbWvQrW08Txdxno1PiDpSxPXG6ndGsfnJjRRtkM0LQ= 40 | golang.org/x/net v0.0.0-20191116160921-f9c825593386/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 42 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 43 | golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= 44 | golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 45 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 46 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 47 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 48 | gopkg.in/cenkalti/backoff.v1 v1.1.0 h1:Arh75ttbsvlpVA7WtVpH4u9h6Zl46xuptxqLxPiSo4Y= 49 | gopkg.in/cenkalti/backoff.v1 v1.1.0/go.mod h1:J6Vskwqd+OMVJl8C33mmtxTBs2gyzfv7UDAkHu8BrjI= 50 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 51 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 52 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 53 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 54 | 
-------------------------------------------------------------------------------- /register.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "fmt" 7 | "sync" 8 | "syscall" 9 | "unsafe" 10 | ) 11 | 12 | // RegisterEventFd is used to register an event file descriptor to a ring. 13 | func RegisterEventFd(ringFd int, fd int) error { 14 | _, _, errno := syscall.Syscall6( 15 | RegisterSyscall, 16 | uintptr(ringFd), 17 | uintptr(RegRegisterEventFd), 18 | uintptr(fd), 19 | uintptr(1), 20 | uintptr(0), 21 | uintptr(0), 22 | ) 23 | if errno < 0 { 24 | var err error 25 | err = errno 26 | return err 27 | } 28 | return nil 29 | } 30 | 31 | // RegisterEventFdAsync is used to register an event file descriptor for async 32 | // polling on a ring. 33 | func RegisterEventFdAsync(ringFd int, fd int) error { 34 | _, _, errno := syscall.Syscall6( 35 | RegisterSyscall, 36 | uintptr(ringFd), 37 | uintptr(RegRegisterEventFdAsync), 38 | uintptr(fd), 39 | uintptr(1), 40 | uintptr(0), 41 | uintptr(0), 42 | ) 43 | if errno < 0 { 44 | var err error 45 | err = errno 46 | return err 47 | } 48 | return nil 49 | } 50 | 51 | // UnregisterEventFd is used to unregister a file descriptor to a ring. 52 | func UnregisterEventFd(ringFd int, fd int) error { 53 | _, _, errno := syscall.Syscall6( 54 | RegisterSyscall, 55 | uintptr(ringFd), 56 | uintptr(RegRegisterEventFd), 57 | uintptr(0), 58 | uintptr(0), 59 | uintptr(0), 60 | uintptr(0), 61 | ) 62 | if errno < 0 { 63 | var err error 64 | err = errno 65 | return err 66 | } 67 | return nil 68 | } 69 | 70 | // RegisterBuffers is used to register buffers to a ring. 
71 | func RegisterBuffers(fd int, vecs []*syscall.Iovec) error { 72 | _, _, errno := syscall.Syscall6( 73 | RegisterSyscall, 74 | uintptr(fd), 75 | uintptr(RegRegisterBuffers), 76 | uintptr(unsafe.Pointer(&vecs[0])), 77 | uintptr(len(vecs)), 78 | uintptr(0), 79 | uintptr(0), 80 | ) 81 | if errno < 0 { 82 | var err error 83 | err = errno 84 | return err 85 | } 86 | return nil 87 | } 88 | 89 | // UnregisterBuffers is used to unregister iovecs from a ring. 90 | func UnregisterBuffers(fd int, vecs []*syscall.Iovec) error { 91 | _, _, errno := syscall.Syscall6( 92 | RegisterSyscall, 93 | uintptr(fd), 94 | uintptr(RegUnregisterBuffers), 95 | uintptr(unsafe.Pointer(&vecs[0])), 96 | uintptr(len(vecs)), 97 | uintptr(0), 98 | uintptr(0), 99 | ) 100 | if errno < 0 { 101 | var err error 102 | err = errno 103 | return err 104 | } 105 | return nil 106 | } 107 | 108 | // RegisterFiles is used to register files to a ring. 109 | func RegisterFiles(fd int, files []int) error { 110 | _, _, errno := syscall.Syscall6( 111 | RegisterSyscall, 112 | uintptr(fd), 113 | uintptr(RegRegisterFiles), 114 | uintptr(unsafe.Pointer(&files[0])), 115 | uintptr(len(files)), 116 | uintptr(0), 117 | uintptr(0), 118 | ) 119 | if errno < 0 { 120 | var err error 121 | err = errno 122 | return err 123 | } 124 | return nil 125 | } 126 | 127 | // UnregisterFiles is used to unregister files to a ring. 128 | func UnregisterFiles(fd int, files []int) error { 129 | _, _, errno := syscall.Syscall6( 130 | RegisterSyscall, 131 | uintptr(fd), 132 | uintptr(RegUnregisterFiles), 133 | uintptr(unsafe.Pointer(&files[0])), 134 | uintptr(len(files)), 135 | uintptr(0), 136 | uintptr(0), 137 | ) 138 | if errno < 0 { 139 | var err error 140 | err = errno 141 | return err 142 | } 143 | return nil 144 | } 145 | 146 | // ReregisterFiles is used to reregister files to a ring. 
147 | func ReregisterFiles(fd int, files []int) error { 148 | _, _, errno := syscall.Syscall6( 149 | RegisterSyscall, 150 | uintptr(fd), 151 | uintptr(RegRegisterFilesUpdate), 152 | uintptr(unsafe.Pointer(&files[0])), 153 | uintptr(len(files)), 154 | uintptr(0), 155 | uintptr(0), 156 | ) 157 | if errno < 0 { 158 | var err error 159 | err = errno 160 | return err 161 | } 162 | return nil 163 | } 164 | 165 | // FileRegistry is an interface for registering files to a Ring. 166 | type FileRegistry interface { 167 | Register(int) error 168 | Unregister(int) error 169 | ID(int) (int, bool) 170 | } 171 | 172 | type fileRegistry struct { 173 | mu sync.RWMutex 174 | ringFd int 175 | f []int 176 | fID map[int]int /* map of fd to offset */ 177 | } 178 | 179 | // NewFileRegistry creates a FileRegistry for use with a ring. 180 | func NewFileRegistry(ringFd int) FileRegistry { 181 | return &fileRegistry{ 182 | ringFd: ringFd, 183 | f: []int{}, 184 | fID: map[int]int{}, 185 | } 186 | } 187 | 188 | // Register implements the FileRegistry interface. It is used to register a 189 | // file descriptor with a ring. 190 | func (r *fileRegistry) Register(fd int) error { 191 | r.mu.Lock() 192 | defer r.mu.Unlock() 193 | 194 | r.f = append(r.f, fd) 195 | r.fID[fd] = len(r.f) - 1 196 | if r.fID[fd] < 0 { 197 | r.fID[fd] = 0 198 | } 199 | return ReregisterFiles(r.ringFd, r.f) 200 | } 201 | 202 | // Unregister implements the FileRegistry interface. It is used to unregister a 203 | // file descriptor form a ring. 204 | func (r *fileRegistry) Unregister(fd int) error { 205 | r.mu.Lock() 206 | defer r.mu.Unlock() 207 | 208 | id, ok := r.fID[fd] 209 | if !ok { 210 | return fmt.Errorf("fd %d not registered", fd) 211 | } 212 | r.f = append(r.f[:id], r.f[id+1:]...) 213 | 214 | return UnregisterFiles(r.ringFd, r.f) 215 | } 216 | 217 | // ID returns the ID of a file descriptor that has been registered. 
func (r *fileRegistry) ID(fd int) (int, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	id, ok := r.fID[fd]
	return id, ok
}
-------------------------------------------------------------------------------- /read_writer.go: --------------------------------------------------------------------------------
// +build linux

package iouring

import (
	"io"
	"os"
	"runtime"
	"sync/atomic"
	"syscall"
	"unsafe"

	"github.com/pkg/errors"
)

// ReadWriteAtCloser supports reading, writing, and closing.
type ReadWriteAtCloser interface {
	io.WriterAt
	io.ReadWriteCloser
}

// ringFIO is used for handling file IO.
type ringFIO struct {
	r *Ring
	f *os.File
	fd int32
	// fOffset is the current file position, advanced atomically as
	// completions are reaped.
	fOffset *int64
	c *completer
}

// getCqe is used for getting a CQE result and will retry up to one time.
func (i *ringFIO) getCqe(reqID uint64, count, min int) (int, error) {
	// TODO: consider adding the submitter interface here, or move out the
	// submit function from this method all together.
	// Only enter the ring when there is something to submit or wait for.
	if count > 0 || min > 0 {
		_, err := i.r.Enter(uint(count), uint(min), EnterGetEvents, nil)
		if err != nil {
			return 0, err
		}
	}

	cq := i.r.cq
	foundIdx := 0
	// Busy-spin over the CQ ring looking for our UserData.
	// NOTE(review): if the completion never appears this loops forever —
	// confirm callers bound it (e.g. via a ring deadline).
findCqe:
	head := atomic.LoadUint32(cq.Head)
	tail := atomic.LoadUint32(cq.Tail)
	mask := atomic.LoadUint32(cq.Mask)
	end := int(tail & mask)

	// First pass: scan from the masked head to the end of the entries slice.
	for x := int(head & mask); x < len(cq.Entries); x++ {
		cqe := cq.Entries[x]
		if cqe.UserData == reqID {
			// Negative results carry a negated errno value.
			if cqe.Res < 0 {
				return 0, syscall.Errno(-cqe.Res)
			}
			// Advance the file offset by the bytes transferred.
			atomic.StoreInt64(i.fOffset, atomic.LoadInt64(i.fOffset)+int64(cqe.Res))
			foundIdx = x
			i.c.complete(foundIdx)
			return int(cqe.Res), nil
		}
		// Reached the tail without a match: restart the scan.
		if x == end {
			goto findCqe
		}
	}
	// Second pass: the ring may have wrapped, so scan from index 0 up to
	// the (re-read) masked tail.
	tail = atomic.LoadUint32(cq.Tail)
	mask = atomic.LoadUint32(cq.Mask)
	end = int(tail & mask)
	for x := 0; x < end; x++ {
		cqe := cq.Entries[x]
		if cqe.UserData == reqID {
			if cqe.Res < 0 {
				return 0, syscall.Errno(-cqe.Res)
			}
			atomic.StoreInt64(i.fOffset, atomic.LoadInt64(i.fOffset)+int64(cqe.Res))
			foundIdx = x
			i.c.complete(foundIdx)
			return int(cqe.Res), nil
		}
		if x == end {
			goto findCqe
		}
	}
	goto findCqe
}

// PrepareWrite is used to prepare a Write SQE. The ring is able to be entered
// after the returned callback is called.
100 | func (i *ringFIO) PrepareWrite(b []byte, flags uint8) (uint64, func(), error) { 101 | sqe, ready := i.r.SubmitEntry() 102 | if sqe == nil { 103 | return 0, nil, errRingUnavailable 104 | } 105 | 106 | sqe.Opcode = Write 107 | sqe.UserData = i.r.ID() 108 | sqe.Fd = i.fd 109 | sqe.Len = uint32(len(b)) 110 | sqe.Flags = flags 111 | sqe.Offset = uint64(atomic.LoadInt64(i.fOffset)) 112 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 113 | 114 | return sqe.UserData, ready, nil 115 | } 116 | 117 | // PrepareRead is used to prepare a Read SQE. The ring is able to be entered 118 | // after the returned callback is called. 119 | func (i *ringFIO) PrepareRead(b []byte, flags uint8) (uint64, func(), error) { 120 | sqe, ready := i.r.SubmitEntry() 121 | if sqe == nil { 122 | return 0, nil, errRingUnavailable 123 | } 124 | 125 | sqe.Opcode = Read 126 | sqe.UserData = i.r.ID() 127 | sqe.Fd = i.fd 128 | sqe.Len = uint32(len(b)) 129 | sqe.Flags = flags 130 | sqe.Offset = uint64(atomic.LoadInt64(i.fOffset)) 131 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 132 | 133 | return sqe.UserData, ready, nil 134 | } 135 | 136 | // Read implements the io.Reader interface. 137 | func (i *ringFIO) Read(b []byte) (int, error) { 138 | id, ready, err := i.PrepareRead(b, 0) 139 | if err != nil { 140 | return 0, err 141 | } 142 | ready() 143 | n, err := i.getCqe(id, 1, 1) 144 | runtime.KeepAlive(b) 145 | if err != nil { 146 | return 0, err 147 | } 148 | if n == 0 { 149 | return n, io.EOF 150 | } 151 | return n, nil 152 | } 153 | 154 | // WriteAt implements the io.WriterAt interface. 
155 | func (i *ringFIO) WriteAt(b []byte, o int64) (int, error) { 156 | sqe, ready := i.r.SubmitEntry() 157 | if sqe == nil { 158 | return 0, errRingUnavailable 159 | } 160 | 161 | sqe.Opcode = Write 162 | sqe.UserData = i.r.ID() 163 | sqe.Fd = i.fd 164 | sqe.Len = uint32(len(b)) 165 | sqe.Flags = 0 166 | sqe.Offset = uint64(o) 167 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 168 | 169 | ready() 170 | 171 | n, err := i.getCqe(sqe.UserData, 1, 1) 172 | runtime.KeepAlive(b) 173 | return n, err 174 | } 175 | 176 | // ReadAt implements the io.ReaderAt interface. 177 | func (i *ringFIO) ReadAt(b []byte, o int64) (int, error) { 178 | sqe, ready := i.r.SubmitEntry() 179 | if sqe == nil { 180 | return 0, errRingUnavailable 181 | } 182 | 183 | sqe.Opcode = Read 184 | sqe.UserData = i.r.ID() 185 | sqe.Fd = i.fd 186 | sqe.Len = uint32(len(b)) 187 | sqe.Flags = 0 188 | sqe.Offset = uint64(o) 189 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 190 | 191 | ready() 192 | 193 | n, err := i.getCqe(sqe.UserData, 1, 1) 194 | runtime.KeepAlive(b) 195 | if err != nil { 196 | return 0, err 197 | } 198 | if n == 0 { 199 | return n, io.EOF 200 | } 201 | return n, nil 202 | } 203 | 204 | // Close implements the io.Closer interface. 205 | func (i *ringFIO) Close() error { 206 | id, err := i.r.PrepareClose(int(i.fd)) 207 | if err != nil { 208 | return err 209 | } 210 | _, err = i.getCqe(id, 1, 1) 211 | if err != nil { 212 | return err 213 | } 214 | return nil 215 | } 216 | 217 | // Seek implements the io.Seeker interface. 
func (i *ringFIO) Seek(offset int64, whence int) (int64, error) {
	// NOTE(review): io.Seeker's contract is to return the new offset
	// relative to the start of the file; this implementation always
	// returns 0 — confirm callers do not rely on the returned offset.
	switch whence {
	case io.SeekStart:
		atomic.StoreInt64(i.fOffset, offset)
		return 0, nil
	case io.SeekCurrent:
		atomic.StoreInt64(i.fOffset, atomic.LoadInt64(i.fOffset)+offset)
		return 0, nil
	case io.SeekEnd:
		stat, err := i.f.Stat()
		if err != nil {
			return 0, err
		}
		// NOTE(review): conventional SeekEnd semantics are size+offset
		// (offset usually negative); this computes size-offset — verify.
		atomic.StoreInt64(i.fOffset, stat.Size()-offset)
		return 0, nil
	default:
		return 0, errors.New("unknown whence")
	}
}
--------------------------------------------------------------------------------
/types_test.go:
--------------------------------------------------------------------------------
// +build linux

package iouring

import (
	"io"
	"io/ioutil"
	"os"
	"sync"
	"testing"

	"github.com/stretchr/testify/require"
)

// TestRingFileReadWriterRead verifies that two sequential ring-backed reads
// each return data from the backing file.
func TestRingFileReadWriterRead(t *testing.T) {
	r, err := New(1024, nil)
	require.NoError(t, err)
	require.NotNil(t, r)

	content := []byte("testing...1,2.3")
	f, err := ioutil.TempFile("", "example")
	require.NoError(t, err)
	defer os.Remove(f.Name())

	_, err = f.Write(content)
	require.NoError(t, err)
	require.NoError(t, f.Sync())

	_, err = f.Seek(0, 0)
	require.NoError(t, err)

	rw, err := r.FileReadWriter(f)
	require.NoError(t, err)

	buf := make([]byte, len(content)/2)
	n, err := rw.Read(buf)
	require.NoError(t, err)
	require.True(t, n > 0)
	require.Subset(t, content, buf)

	buf = make([]byte, len(content)/2)
	n, err = rw.Read(buf)
	require.NoError(t, err)
	require.True(t, n > 0)
	require.Subset(t, content, buf)

	require.NoError(t, rw.Close())
}

// TestRingFileReadWriterSeek exercises Seek with SeekCurrent and SeekEnd;
// note it only checks for errors, not the resulting offset.
func TestRingFileReadWriterSeek(t *testing.T) {
	r, err := New(1024, nil)
	require.NoError(t, err)
	require.NotNil(t, r)

	content := 
[]byte("testing...1,2,3") 56 | f, err := ioutil.TempFile("", "example") 57 | require.NoError(t, err) 58 | defer os.Remove(f.Name()) 59 | 60 | rw, err := r.FileReadWriter(f) 61 | require.NoError(t, err) 62 | 63 | _, err = rw.Write(content) 64 | require.NoError(t, err) 65 | require.NoError(t, f.Sync()) 66 | 67 | _, err = rw.Seek(0, io.SeekCurrent) 68 | require.NoError(t, err) 69 | _, err = rw.Seek(0, io.SeekEnd) 70 | require.NoError(t, err) 71 | } 72 | 73 | func TestRingFileReadWriterReadAt(t *testing.T) { 74 | r, err := New(1024, nil) 75 | require.NoError(t, err) 76 | require.NotNil(t, r) 77 | 78 | content := []byte("testing...1,2,3") 79 | f, err := ioutil.TempFile("", "example") 80 | require.NoError(t, err) 81 | defer os.Remove(f.Name()) 82 | 83 | rw, err := r.FileReadWriter(f) 84 | require.NoError(t, err) 85 | 86 | _, err = rw.Write(content) 87 | require.NoError(t, err) 88 | require.NoError(t, f.Sync()) 89 | 90 | buf := make([]byte, len(content)) 91 | _, err = rw.ReadAt(buf, 0) 92 | require.NoError(t, err) 93 | require.Equal(t, content, buf) 94 | } 95 | 96 | func TestRingFileReadWriterWriteAt(t *testing.T) { 97 | r, err := New(1024, nil) 98 | require.NoError(t, err) 99 | require.NotNil(t, r) 100 | 101 | content := []byte("testing...1,2,3") 102 | f, err := ioutil.TempFile("", "example") 103 | require.NoError(t, err) 104 | defer os.Remove(f.Name()) 105 | 106 | rw, err := r.FileReadWriter(f) 107 | require.NoError(t, err) 108 | 109 | _, err = rw.WriteAt(content, 0) 110 | require.NoError(t, err) 111 | require.NoError(t, f.Sync()) 112 | 113 | buf := []byte("testing...3,2,1") 114 | n, err := rw.WriteAt(buf, 0) 115 | require.NoError(t, err) 116 | require.Equal(t, len(buf), n) 117 | 118 | buf2 := make([]byte, len(buf)) 119 | _, err = rw.ReadAt(buf2, 0) 120 | require.NoError(t, err) 121 | require.Equal(t, buf, buf2) 122 | } 123 | 124 | func TestRingFileReadWriterWrite(t *testing.T) { 125 | r, err := New(1024, nil) 126 | require.NoError(t, err) 127 | require.NotNil(t, r) 128 
| 129 | content := []byte("testing...1,2.3") 130 | f, err := ioutil.TempFile("", "example") 131 | require.NoError(t, err) 132 | defer os.Remove(f.Name()) 133 | 134 | rw, err := r.FileReadWriter(f) 135 | require.NoError(t, err) 136 | 137 | // Write to the file using the ring 138 | _, err = rw.Write(content) 139 | require.NoError(t, err) 140 | 141 | require.NoError(t, f.Sync()) 142 | 143 | _, err = f.Seek(0, 0) 144 | require.NoError(t, err) 145 | 146 | buf := make([]byte, len(content)) 147 | n, err := f.Read(buf) 148 | require.NoError(t, err) 149 | require.True(t, n > 0) 150 | require.Equal(t, content, buf) 151 | require.NoError(t, rw.Close()) 152 | } 153 | 154 | func TestRingFileReadWriterWriteRead(t *testing.T) { 155 | r, err := New(1024, nil) 156 | require.NoError(t, err) 157 | require.NotNil(t, r) 158 | 159 | content := []byte("testing...1,2,3") 160 | f, err := ioutil.TempFile("", "example") 161 | require.NoError(t, err) 162 | defer os.Remove(f.Name()) 163 | 164 | rw, err := r.FileReadWriter(f) 165 | require.NoError(t, err) 166 | 167 | // Write to the file using the ring 168 | _, err = rw.Write(content) 169 | require.NoError(t, err) 170 | 171 | require.NoError(t, f.Sync()) 172 | 173 | _, err = rw.Seek(0, 0) 174 | require.NoError(t, err) 175 | 176 | buf := make([]byte, len(content)) 177 | n, err := rw.Read(buf) 178 | require.NoError(t, err) 179 | require.True(t, n > 0) 180 | require.Equal(t, content, buf) 181 | 182 | require.NoError(t, rw.Close()) 183 | } 184 | 185 | func TestRingReadWrap(t *testing.T) { 186 | ringSize := uint(8) 187 | r, err := New(ringSize, nil) 188 | require.NoError(t, err) 189 | require.NotNil(t, r) 190 | 191 | f, err := os.Open("/dev/zero") 192 | require.NoError(t, err) 193 | 194 | rw, err := r.FileReadWriter(f) 195 | require.NoError(t, err) 196 | 197 | for i := 0; i < int(ringSize)*100; i++ { 198 | buf := make([]byte, 8) 199 | n, err := rw.Read(buf) 200 | require.NoError(t, err) 201 | require.True(t, n > 0) 202 | } 203 | } 204 | 205 | func 
TestConcurrentReaders(t *testing.T) { 206 | ringSize := uint(8) 207 | r, err := New(ringSize, &Params{}) 208 | require.NoError(t, err) 209 | require.NotNil(t, r) 210 | 211 | f, err := os.Open("/dev/zero") 212 | require.NoError(t, err) 213 | 214 | rw, err := r.FileReadWriter(f) 215 | require.NoError(t, err) 216 | 217 | work := make(chan struct{}) 218 | stop := make(chan struct{}) 219 | done := make(chan struct{}, 8) 220 | var wg sync.WaitGroup 221 | 222 | for i := 0; i < 4; i++ { 223 | wg.Add(1) 224 | go func() { 225 | for { 226 | select { 227 | case <-stop: 228 | wg.Done() 229 | return 230 | case <-work: 231 | buf := make([]byte, 1) 232 | _, err := rw.Read(buf) 233 | if err != nil && err != ErrEntryNotFound { 234 | require.NoError(t, err) 235 | } 236 | done <- struct{}{} 237 | } 238 | } 239 | }() 240 | } 241 | 242 | for i := 0; i < int(ringSize+2); i++ { 243 | work <- struct{}{} 244 | <-done 245 | } 246 | close(stop) 247 | wg.Wait() 248 | } 249 | 250 | func TestCqeIsZero(t *testing.T) { 251 | cqe := &CompletionEntry{} 252 | require.True(t, cqe.IsZero()) 253 | cqe.Res = 1 254 | require.False(t, cqe.IsZero()) 255 | } 256 | 257 | func TestReadWriterEOF(t *testing.T) { 258 | r, err := New(1024, nil) 259 | require.NoError(t, err) 260 | require.NotNil(t, r) 261 | 262 | content := []byte("testing...1,2,3") 263 | f, err := ioutil.TempFile("", "example") 264 | require.NoError(t, err) 265 | 266 | rw, err := r.FileReadWriter(f) 267 | require.NoError(t, err) 268 | 269 | // Write to the file using the ring 270 | _, err = rw.Write(content) 271 | require.NoError(t, err) 272 | 273 | buf := make([]byte, 4096) 274 | _, err = rw.Read(buf) 275 | require.Error(t, err) 276 | f.Close() 277 | os.Remove(f.Name()) 278 | } 279 | -------------------------------------------------------------------------------- /types.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "runtime" 7 | "sync" 8 | 
"sync/atomic" 9 | 10 | "github.com/pkg/errors" 11 | ) 12 | 13 | const ( 14 | // CqSeenFlag is a nonstandard flag for handling concurrent readers 15 | // from the CompletionQueue. 16 | CqSeenFlag = 1 17 | ) 18 | 19 | var ( 20 | // ErrEntryNotFound is returned when a CQE is not found. 21 | ErrEntryNotFound = errors.New("Completion entry not found") 22 | 23 | errCQEMissing = errors.New("cqe missing") 24 | 25 | cqePool = sync.Pool{ 26 | New: func() interface{} { 27 | return &CompletionEntry{} 28 | }, 29 | } 30 | ) 31 | 32 | type completionRequest struct { 33 | id uint64 34 | res int32 35 | flags uint32 36 | done chan struct{} 37 | } 38 | 39 | // Params are used to configured a io uring. 40 | type Params struct { 41 | SqEntries uint32 42 | CqEntries uint32 43 | Flags uint32 44 | SqThreadCPU uint32 45 | SqThreadIdle uint32 46 | Features uint32 47 | WqFD uint32 48 | Resv [3]uint32 49 | SqOffset SQRingOffset 50 | CqOffset CQRingOffset 51 | } 52 | 53 | // SQRingOffset describes the various submit queue offsets. 54 | type SQRingOffset struct { 55 | Head uint32 56 | Tail uint32 57 | RingMask uint32 58 | Entries uint32 59 | Flags uint32 60 | Dropped uint32 61 | Array uint32 62 | Resv1 uint32 63 | Resv2 uint64 64 | } 65 | 66 | // CQRingOffset describes the various completion queue offsets. 67 | type CQRingOffset struct { 68 | Head uint32 69 | Tail uint32 70 | RingMask uint32 71 | Entries uint32 72 | Overflow uint32 73 | Cqes uint32 74 | Flags uint32 75 | Resv [2]uint64 76 | } 77 | 78 | // SubmitEntry is an IO submission data structure (Submission Queue Entry). 
79 | type SubmitEntry struct { 80 | Opcode Opcode /* type of operation for this sqe */ 81 | Flags uint8 /* IOSQE_ flags */ 82 | Ioprio uint16 /* ioprio for the request */ 83 | Fd int32 /* file descriptor to do IO on */ 84 | Offset uint64 /* offset into file */ 85 | Addr uint64 /* pointer to buffer or iovecs */ 86 | Len uint32 /* buffer size or number of iovecs */ 87 | UFlags int32 88 | UserData uint64 89 | Anon0 [24]byte /* extra padding */ 90 | } 91 | 92 | // Reset is used to reset an SubmitEntry. 93 | func (e *SubmitEntry) Reset() { 94 | e.Opcode = Nop 95 | e.Flags = 0 96 | e.Ioprio = 0 97 | e.Fd = -1 98 | e.Offset = 0 99 | e.Addr = 0 100 | e.Len = 0 101 | e.UFlags = 0 102 | e.UserData = 0 103 | } 104 | 105 | // SubmitQueue represents the submit queue ring buffer. 106 | type SubmitQueue struct { 107 | Size uint32 108 | Head *uint32 109 | Tail *uint32 110 | Mask *uint32 111 | Flags *uint32 112 | Dropped *uint32 113 | 114 | // Array holds entries to be submitted; it must never be resized it is mmap'd. 115 | Array []uint32 116 | // Entries must never be resized, it is mmap'd. 117 | Entries []SubmitEntry 118 | // ptr is pointer to the start of the mmap. 119 | ptr uintptr 120 | 121 | // entered is when the ring is being entered. 122 | entered *uint32 123 | // writes is used to keep track of the number of concurrent writers to 124 | // the ring. 125 | writes *uint32 126 | } 127 | 128 | // Reset is used to reset all entries. 129 | func (s *SubmitQueue) Reset() { 130 | for _, entry := range s.Entries { 131 | entry.Reset() 132 | } 133 | } 134 | 135 | // NeedWakeup is used to determine whether the submit queue needs awoken. 
func (s *SubmitQueue) NeedWakeup() bool {
	// The kernel sets SqNeedWakeup in the shared flags when the SQ thread
	// is idle and io_uring_enter must be called to wake it.
	return atomic.LoadUint32(s.Flags)&SqNeedWakeup != 0
}

// enterLock spins until it gains exclusive "entering" ownership by flipping
// the entered flag from 0 to 1.
func (s *SubmitQueue) enterLock() {
	for {
		// NOTE(review): this only backs off while writes != 0 AND
		// entered == 1; if writes != 0 but entered == 0 the CAS below
		// can still succeed while writers are in flight — confirm
		// whether || was intended here.
		if atomic.LoadUint32(s.writes) != 0 && atomic.LoadUint32(s.entered) == 1 {
			runtime.Gosched()
			continue
		}
		if atomic.CompareAndSwapUint32(s.entered, 0, 1) {
			break
		}
	}
}

// enterUnlock releases the flag acquired by enterLock.
func (s *SubmitQueue) enterUnlock() {
	atomic.StoreUint32(s.entered, 0)
}

// completeWrite is used to signal that an entry in the map has been fully
// written.
func (s *SubmitQueue) completeWrite() {
	for {
		writes := atomic.LoadUint32(s.writes)
		if writes == 0 {
			// A completion with no in-flight write is a programmer
			// error; fail loudly rather than corrupt the counter.
			panic("invalid number of sq write completions")
		}
		// CAS-decrement; yield and retry if another writer raced us.
		if atomic.CompareAndSwapUint32(s.writes, writes, writes-1) {
			return
		}
		runtime.Gosched()
	}
}

// CompletionEntry IO completion data structure (Completion Queue Entry).
type CompletionEntry struct {
	UserData uint64 /* sqe->data submission data passed back */
	Res      int32  /* result code for this event */
	Flags    uint32
}

// IsZero returns if the CQE is zero valued.
func (c *CompletionEntry) IsZero() bool {
	return c.UserData == 0 && c.Res == 0 && c.Flags == 0
}

// CompletionQueue represents the completion queue ring buffer.
type CompletionQueue struct {
	Size     uint32
	Head     *uint32
	Tail     *uint32
	Mask     *uint32
	Overflow *uint32
	Flags    *uint32

	// Entries must never be resized, it is mmap'd.
	Entries []CompletionEntry
	// ptr is the start of the completion queue mmap.
	ptr uintptr
}

// Advance is used to advance the completion queue by a count.
func (c *CompletionQueue) Advance(count int) {
	atomic.AddUint32(c.Head, uint32(count))
}

// EntryBy (DEPRECATED) returns a CompletionEntry by comparing the user data,
// this should be called after the ring has been entered.
func (c *CompletionQueue) EntryBy(userData uint64) (*CompletionEntry, error) {
	head := atomic.LoadUint32(c.Head)
	tail := atomic.LoadUint32(c.Tail)
	mask := atomic.LoadUint32(c.Mask)
	// Empty queue: head and tail index the same slot.
	if head&mask == tail&mask {
		return nil, ErrEntryNotFound
	}

	// seenIdx is used for indicating the largest consecutive seen CQEs,
	// which is then used for setting the new head position. This is done
	// by setting the CqSeenFlag bit on a CQE's Flags once a CQE has been
	// read. The head is then set to the largest consecutive seen index.
	seenIdx := head & mask
	seen := false
	seenEnd := false
	// First pass: scan from the current head to the end of the mmap'd
	// entries slice.
	for i := seenIdx; i <= uint32(len(c.Entries)-1); i++ {
		cqe := c.Entries[i]
		if cqe.Flags&CqSeenFlag == CqSeenFlag || cqe.IsZero() {
			seen = true
		} else if !seenEnd {
			// First unseen entry terminates the consecutive-seen run.
			seen = false
			seenEnd = true
		}
		if seen == true && !seenEnd {
			seenIdx = i + 1
		}
		if cqe.UserData == userData {
			// Mark the entry so other readers treat it as consumed.
			c.Entries[i].Flags |= CqSeenFlag
			// Wrap the new head back to the start of the ring.
			if seenIdx == c.Size {
				seenIdx = 0
			}
			atomic.StoreUint32(c.Head, seenIdx)
			return &c.Entries[i], nil
		}
	}
	// Handle wrapping.
240 | seenIdx = uint32(0) 241 | seen = false 242 | seenEnd = false 243 | tail = atomic.LoadUint32(c.Tail) 244 | mask = atomic.LoadUint32(c.Mask) 245 | for i := uint32(0); i <= tail&mask; i++ { 246 | cqe := c.Entries[i] 247 | if cqe.Flags&CqSeenFlag == CqSeenFlag || cqe.IsZero() { 248 | seen = true 249 | } else if !seenEnd { 250 | seen = false 251 | seenEnd = true 252 | } 253 | if seen == true && !seenEnd { 254 | seenIdx = i + 1 255 | } 256 | if cqe.UserData == userData { 257 | c.Entries[i].Flags |= CqSeenFlag 258 | if seenIdx == c.Size { 259 | seenIdx = 0 260 | } 261 | atomic.StoreUint32(c.Head, seenIdx) 262 | return &c.Entries[i], nil 263 | } 264 | } 265 | 266 | return nil, ErrEntryNotFound 267 | } 268 | -------------------------------------------------------------------------------- /examples/standalone/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "reflect" 10 | "syscall" 11 | "unsafe" 12 | ) 13 | 14 | const ( 15 | // SetupSyscall defines the syscall number for io_uring_setup. 16 | SetupSyscall = 425 17 | // EnterSyscall defines the syscall number for io_uring_enter. 18 | EnterSyscall = 426 19 | // RegisterSyscall defines the syscall number for io_uring_register. 20 | RegisterSyscall = 427 21 | 22 | // SqRingOffset is the offset of the submission queue. 23 | SqRingOffset uint64 = 0 24 | // CqRingOffset is the offset of the completion queue. 25 | CqRingOffset uint64 = 0x8000000 26 | // SqeRingOffset is the offset of the submission queue entries. 27 | SqeRingOffset uint64 = 0x10000000 28 | 29 | // EnterGetEvents if the bit is set in flags, then the system call will 30 | // attempt to wait for min_complete event completions before 31 | // returning. 
32 | EnterGetEvents uint = (1 << 0) 33 | 34 | // Opcodes 35 | Read uint8 = 22 36 | ) 37 | 38 | var ( 39 | uint32Size = unsafe.Sizeof(uint32(0)) 40 | cqeSize = unsafe.Sizeof(Cqe{}) 41 | sqeSize = unsafe.Sizeof(Sqe{}) 42 | ) 43 | 44 | type Sqe struct { 45 | Opcode uint8 46 | Flags uint8 47 | Ioprio uint16 48 | Fd int32 49 | Off uint64 50 | Addr uint64 51 | Len uint32 52 | Rw_flags int32 53 | User_data uint64 54 | Anon0 [24]byte 55 | } 56 | 57 | type Cqe struct { 58 | Data uint64 59 | Res int32 60 | Flags uint32 61 | } 62 | 63 | type SqOffset struct { 64 | Head uint32 65 | Tail uint32 66 | Mask uint32 67 | Entries uint32 68 | Flags uint32 69 | Dropped uint32 70 | Array uint32 71 | Resv1 uint32 72 | Resv2 uint64 73 | } 74 | 75 | type CqOffset struct { 76 | Head uint32 77 | Tail uint32 78 | Mask uint32 79 | Entries uint32 80 | Overflow uint32 81 | Cqes uint32 82 | Resv [2]uint64 83 | } 84 | 85 | type Params struct { 86 | Sq_entries uint32 87 | Cq_entries uint32 88 | Flags uint32 89 | Sq_thread_cpu uint32 90 | Sq_thread_idle uint32 91 | Features uint32 92 | Wq_fd uint32 93 | Resv [3]uint32 94 | Sq_off SqOffset 95 | Cq_off CqOffset 96 | } 97 | 98 | type SubmitQueue struct { 99 | Size uint32 100 | Head *uint32 101 | Tail *uint32 102 | Mask *uint32 103 | Flags *uint32 104 | Dropped *uint32 105 | 106 | // Entries must never be resized, it is mmap'd. 107 | Entries []Sqe 108 | Array []uint32 109 | } 110 | 111 | type CompletionQueue struct { 112 | Size uint32 113 | Head *uint32 114 | Tail *uint32 115 | Mask *uint32 116 | Overflow *uint32 117 | 118 | // Entries must never be resized, it is mmap'd. 119 | Entries []Cqe 120 | } 121 | 122 | // Setup is used to setup a io_uring using the io_uring_setup syscall. 
123 | func Setup(entries uint, params *Params) (int, error) { 124 | fd, _, errno := syscall.Syscall( 125 | SetupSyscall, 126 | uintptr(entries), 127 | uintptr(unsafe.Pointer(params)), 128 | uintptr(0), 129 | ) 130 | // errno is a special type in Go, see: 131 | // https://golang.org/pkg/syscall/#Errno 132 | if errno != 0 { 133 | err := errno 134 | return 0, err 135 | } 136 | return int(fd), nil 137 | } 138 | 139 | // MmapRing is used to configure the submit and completion queues, it should 140 | // only be called after the Setup function has completed successfully. 141 | // See: 142 | // https://github.com/axboe/liburing/blob/master/src/setup.c#L22 143 | func MmapRing(fd int, p *Params, sq *SubmitQueue, cq *CompletionQueue) error { 144 | var ( 145 | cqPtr uintptr 146 | sqPtr uintptr 147 | errno syscall.Errno 148 | err error 149 | ) 150 | sq.Size = uint32(uint(p.Sq_off.Array) + (uint(p.Sq_entries) * uint(uint32Size))) 151 | 152 | sqPtr, _, errno = syscall.Syscall6( 153 | syscall.SYS_MMAP, 154 | uintptr(0), 155 | uintptr(sq.Size), 156 | syscall.PROT_READ|syscall.PROT_WRITE, 157 | syscall.MAP_SHARED|syscall.MAP_POPULATE, 158 | uintptr(fd), 159 | uintptr(SqRingOffset), 160 | ) 161 | if errno != 0 { 162 | err = errno 163 | return err 164 | } 165 | 166 | // Conversion of a uintptr back to Pointer is not valid in general, 167 | // except for: 168 | // 3) Conversion of a Pointer to a uintptr and back, with arithmetic. 169 | 170 | // go vet doesn't like these casts so it probably violates the memory 171 | // model. 172 | sq.Head = (*uint32)(unsafe.Pointer(sqPtr + uintptr(p.Sq_off.Head))) 173 | sq.Tail = (*uint32)(unsafe.Pointer(sqPtr + uintptr(p.Sq_off.Tail))) 174 | sq.Mask = (*uint32)(unsafe.Pointer(sqPtr + uintptr(p.Sq_off.Mask))) 175 | sq.Flags = (*uint32)(unsafe.Pointer(sqPtr + uintptr(p.Sq_off.Flags))) 176 | sq.Dropped = (*uint32)(unsafe.Pointer(sqPtr + uintptr(p.Sq_off.Dropped))) 177 | 178 | // Map the sqe ring. 
179 | sqePtr, _, errno := syscall.Syscall6( 180 | syscall.SYS_MMAP, 181 | uintptr(0), 182 | uintptr(uint(p.Sq_entries)*uint(sqeSize)), 183 | syscall.PROT_READ|syscall.PROT_WRITE, 184 | syscall.MAP_SHARED|syscall.MAP_POPULATE, 185 | uintptr(fd), 186 | uintptr(SqeRingOffset), 187 | ) 188 | if errno != 0 { 189 | err = errno 190 | return err 191 | } 192 | 193 | // Making mmap'd slices requires doing an unsafe pointer cast. 194 | sq.Entries = *(*[]Sqe)(unsafe.Pointer(&reflect.SliceHeader{ 195 | Data: uintptr(sqePtr), 196 | Len: int(p.Sq_entries), 197 | Cap: int(p.Sq_entries), 198 | })) 199 | sq.Array = *(*[]uint32)(unsafe.Pointer(&reflect.SliceHeader{ 200 | Data: uintptr(unsafe.Pointer(sqPtr + uintptr(p.Sq_off.Array))), 201 | Len: int(p.Sq_entries), 202 | Cap: int(p.Sq_entries), 203 | })) 204 | 205 | // Map the completion queue. 206 | cq.Size = uint32(uint(p.Cq_off.Cqes) + (uint(p.Cq_entries) * uint(cqeSize))) 207 | cqPtr, _, errno = syscall.Syscall6( 208 | syscall.SYS_MMAP, 209 | uintptr(0), 210 | uintptr(cq.Size), 211 | syscall.PROT_READ|syscall.PROT_WRITE, 212 | syscall.MAP_SHARED|syscall.MAP_POPULATE, 213 | uintptr(fd), 214 | uintptr(CqRingOffset), 215 | ) 216 | if errno != 0 { 217 | err = errno 218 | return err 219 | } 220 | 221 | cq.Head = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.Cq_off.Head)))) 222 | cq.Tail = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.Cq_off.Tail)))) 223 | cq.Mask = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.Cq_off.Mask)))) 224 | cq.Overflow = (*uint32)(unsafe.Pointer(uintptr(uint(cqPtr) + uint(p.Cq_off.Overflow)))) 225 | 226 | cq.Entries = *(*[]Cqe)(unsafe.Pointer(&reflect.SliceHeader{ 227 | Data: uintptr(uint(cqPtr) + uint(p.Cq_off.Cqes)), 228 | Len: int(p.Cq_entries), 229 | Cap: int(p.Cq_entries), 230 | })) 231 | 232 | return nil 233 | } 234 | 235 | // Enter is used to submit to the queue. 
236 | func Enter(fd int, toSubmit uint, minComplete uint, flags uint /* sigset *unix.Sigset_t*/) (int, error) { 237 | res, _, errno := syscall.Syscall6( 238 | EnterSyscall, 239 | uintptr(fd), 240 | uintptr(toSubmit), 241 | uintptr(minComplete), 242 | uintptr(flags), 243 | /*uintptr(unsafe.Pointer(sigset)),*/ 244 | uintptr(0), 245 | uintptr(0), 246 | ) 247 | if errno != 0 { 248 | var err error 249 | err = errno 250 | return 0, err 251 | } 252 | 253 | return int(res), nil 254 | } 255 | 256 | func main() { 257 | // First create a tempfile for writing some test data. 258 | tmpFile, err := ioutil.TempFile("", "example") 259 | if err != nil { 260 | log.Fatal(err) 261 | } 262 | defer os.Remove(tmpFile.Name()) 263 | 264 | // Write to the file without using the ring. 265 | content := []byte("testing 1,2,3") 266 | _, err = tmpFile.Write(content) 267 | if err != nil { 268 | log.Fatal(err) 269 | } 270 | 271 | _, err = tmpFile.Seek(0, 0) 272 | if err != nil { 273 | log.Fatal(err) 274 | } 275 | 276 | p := &Params{} 277 | ringFd, err := Setup(8, p) 278 | if err != nil { 279 | log.Fatal(err) 280 | } 281 | var ( 282 | cq CompletionQueue 283 | sq SubmitQueue 284 | ) 285 | if err := MmapRing(ringFd, p, &sq, &cq); err != nil { 286 | log.Fatal(err) 287 | } 288 | 289 | offset := uint64(0) 290 | exContent := []byte{} 291 | 292 | // Do two reads to read the content from the tempfile. 293 | for i := 0; i < 2; i++ { 294 | readBuff := make([]byte, len(content)/2) 295 | sqTail := *sq.Tail 296 | sqIdx := sqTail & *sq.Mask 297 | 298 | // Prepare the Sqe 299 | sq.Entries[sqIdx].Opcode = Read 300 | sq.Entries[sqIdx].Fd = int32(tmpFile.Fd()) 301 | sq.Entries[sqIdx].Off = offset 302 | sq.Entries[sqIdx].Len = uint32(len(readBuff)) 303 | sq.Entries[sqIdx].User_data = uint64(i + 1) 304 | 305 | // This is probably a violation of the memory model, but in 306 | // order for reads to work we have to pass the address of the 307 | // read buffer to the SQE. 
If the readBuffer is heap allocated 308 | // then it is less of a problem. 309 | sq.Entries[sqIdx].Addr = (uint64)(uintptr(unsafe.Pointer(&readBuff[0]))) 310 | 311 | sq.Array[sqIdx] = *sq.Head & *sq.Mask 312 | *sq.Tail += 1 313 | fmt.Printf("sq head:%v tail: %v\nsq entries: %+v\n", *sq.Head, *sq.Tail, sq.Entries[:2]) 314 | 315 | fmt.Println("entering the ring") 316 | n, err := Enter(ringFd, uint(1), uint(1), EnterGetEvents) 317 | if err != nil { 318 | log.Fatal(err) 319 | } 320 | if n != 1 { 321 | log.Fatalf("expected 1 completed entry, got: %v", n) 322 | } 323 | 324 | cqTail := *cq.Tail 325 | cqHead := *cq.Head 326 | if cqHead == cqTail { 327 | log.Fatal("No entries\n") 328 | } 329 | fmt.Printf("cq head:%v tail: %v\ncq entries: %+v\n", *cq.Head, *cq.Tail, cq.Entries[:2]) 330 | 331 | // Search for the cqe in a suboptimal manner 332 | for cqIdx := cqHead & *cq.Mask; cqIdx < cqTail; cqIdx++ { 333 | // The Cqe data should match our loop index (i)+1 334 | if cq.Entries[int(cqIdx)].Data == uint64(i+1) { 335 | exContent = append(exContent, readBuff...) 336 | fmt.Printf("got content: %+v\n", string(readBuff)) 337 | *cq.Head += 1 338 | offset += uint64(cq.Entries[int(cqIdx)].Res) 339 | } 340 | } 341 | } 342 | 343 | if !bytes.Equal(content, exContent) { 344 | log.Fatalf("Expected: %+v, got: %+v", string(content), string(exContent)) 345 | } 346 | } 347 | -------------------------------------------------------------------------------- /conn.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "fmt" 7 | "io/ioutil" 8 | "net" 9 | "os" 10 | "strconv" 11 | "strings" 12 | "sync/atomic" 13 | "syscall" 14 | ) 15 | 16 | const ( 17 | POLLIN = 0x1 18 | POLLPRI = 0x2 19 | POLLOUT = 0x4 20 | POLLERR = 0x8 21 | POLLHUP = 0x10 22 | POLLNVAL = 0x20 23 | 24 | // SOReuseport is the socket option to reuse socket port. 
	SOReuseport int = 0x0F

	// TCPFastopen is the socket option to open a TCP fast open.
	TCPFastopen int = 0x17
)

// FastOpenAllowed return nil if fast open is enabled.
func FastOpenAllowed() error {
	// BUG(review): this reads /proc/sys/net/ipv4/tcp_fack, but the check
	// and the error message below refer to tcp_fastopen; the path should
	// almost certainly be /proc/sys/net/ipv4/tcp_fastopen.
	b, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_fack")
	if err != nil {
		return err
	}
	allowed, err := strconv.Atoi(strings.Replace(string(b), "\n", "", -1))
	if err != nil {
		return err
	}

	if allowed != 3 {
		return fmt.Errorf("set /proc/sys/net/ipv4/tcp_fastopen to 3")
	}

	return nil
}

// connInfo tracks the file descriptor and ring id of a polled connection.
type connInfo struct {
	fd     int
	id     uint64
	sqeIds chan uint64
}

// addr is a minimal net.Addr implementation.
type addr struct {
	net string
	s   string
}

// Network implements the net.Addr interface.
func (a *addr) Network() string {
	return a.net
}

// String implements the net.Addr interface.
func (a *addr) String() string {
	return a.s
}

// ringListener is a net.Listener backed by a Ring.
type ringListener struct {
	debug      bool
	r          *Ring
	f          *os.File
	a          *addr
	stop       chan struct{}
	errHandler func(error)
	newConn    chan net.Conn
	connGet    chan chan net.Conn
}

// run is used to interact with the ring
func (l *ringListener) run() {
	// Arm a poll on the listening fd; its completion signals a pending
	// accept.
	id := l.r.ID()
	fd := int(l.f.Fd())
	cInfo := &connInfo{
		fd: fd,
		id: id,
	}
	sqe, commit := l.r.SubmitEntry()
	sqe.Opcode = PollAdd
	sqe.Fd = int32(fd)
	sqe.UFlags = int32(POLLIN)
	sqe.UserData = id
	commit()

	conns := map[uint64]*connInfo{id: cInfo}

	// NOTE(review): the default branch makes this a busy loop when Enter
	// returns immediately; confirm Enter blocks for min completions.
	for {
		select {
		case <-l.stop:
			return
		default:
			_, err := l.r.Enter(1024, 1, EnterGetEvents, nil)
			if err != nil {
				if l.errHandler != nil {
					l.errHandler(err)
				}
				continue
			}
			l.walkCq(conns)
		}
	}
}

// walkCq scans the completion queue for a CQE belonging to a tracked
// connection, marks consumed entries with CqSeenFlag, advances the head to
// the largest consecutive seen index, and dispatches to onListen.
func (l *ringListener) walkCq(conns map[uint64]*connInfo) {
	head := atomic.LoadUint32(l.r.cq.Head)
	tail := atomic.LoadUint32(l.r.cq.Tail)
	mask := atomic.LoadUint32(l.r.cq.Mask)
	// Empty completion queue.
	if head&mask == tail&mask {
		return
	}

	seenIdx := head & mask
	seenEnd := false
	if l.debug {
		// NOTE(review): the [:9] debug slices will panic for rings with
		// fewer than 9 entries — confirm minimum ring size.
		sqHead := *l.r.sq.Head
		sqTail := *l.r.sq.Tail
		sqMask := *l.r.sq.Mask
		cqHead := *l.r.cq.Head
		cqTail := *l.r.cq.Tail
		cqMask := *l.r.cq.Mask
		fmt.Printf("sq head: %v tail: %v\nsq entries: %+v\n", sqHead&sqMask, sqTail&sqMask, l.r.sq.Entries[:9])
		fmt.Printf("cq head: %v tail: %v\ncq entries: %+v\n", cqHead&cqMask, cqTail&cqMask, l.r.cq.Entries[:9])
	}
	// First pass: from the head position up to the masked tail.
	for i := seenIdx; i <= tail&mask; i++ {
		cqe := l.r.cq.Entries[i]
		if (cqe.Flags&CqSeenFlag == CqSeenFlag || cqe.IsZero()) && !seenEnd {
			seenIdx = i
		} else {
			seenEnd = true
		}
		cInfo, ok := conns[cqe.UserData]
		if !ok {
			continue
		}
		l.r.cq.Entries[i].Flags |= CqSeenFlag
		// Only advance the head forward; lose the race harmlessly.
		head = atomic.LoadUint32(l.r.cq.Head)
		if seenIdx > head {
			atomic.CompareAndSwapUint32(l.r.cq.Head, head, seenIdx)
		}
		l.onListen(conns, cInfo)
		return
	}

	// Handle wrapping.
	seenIdx = uint32(0)
	seenEnd = false
	tail = atomic.LoadUint32(l.r.cq.Tail)
	mask = atomic.LoadUint32(l.r.cq.Mask)
	// Second pass: from the start of the ring up to the masked tail.
	for i := uint32(0); i <= tail&mask; i++ {
		cqe := l.r.cq.Entries[i]
		if (cqe.Flags&CqSeenFlag == CqSeenFlag || cqe.IsZero()) && !seenEnd {
			seenIdx = i
		} else {
			seenEnd = true
		}
		cInfo, ok := conns[cqe.UserData]
		if !ok {
			continue
		}
		l.r.cq.Entries[i].Flags |= CqSeenFlag
		head = atomic.LoadUint32(l.r.cq.Head)
		if seenIdx > head {
			atomic.CompareAndSwapUint32(l.r.cq.Head, head, seenIdx)
		}
		l.onListen(conns, cInfo)
		return
	}
}

// onListen is called when processing a cqe for a listening socket.
func (l *ringListener) onListen(conns map[uint64]*connInfo, cInfo *connInfo) {
	var (
		// NOTE(review): newConnInfo is never assigned, so the SQE below
		// is submitted with UserData == 0 and the new connection is
		// never added to conns — confirm this is the intended tracking.
		newConnInfo connInfo
		offset      int64
		rc          = ringConn{
			stop: make(chan struct{}, 2),
			poll: make(chan uint64, 64),
			r:    l.r,
		}
	)
	for {
		// Wait for a new connection to arrive and add it to the ring.
		newFd, sa, err := syscall.Accept4(cInfo.fd, syscall.SOCK_NONBLOCK)
		if err != nil {
			// TODO: Log this or something?
			// NOTE(review): panicking on a transient accept error
			// (e.g. EAGAIN on a nonblocking socket) kills the
			// listener goroutine — should be handled or retried.
			panic(err.Error())
		}
		rc.fd = newFd
		rc.laddr = l.a
		rc.raddr = &addr{net: l.a.net}
		switch sockType := sa.(type) {
		case *syscall.SockaddrInet4:
			// NOTE(review): %s on a [4]byte address prints raw bytes,
			// not a dotted quad — verify intended formatting.
			rc.raddr.s = fmt.Sprintf("%s:%d", sockType.Addr, sockType.Port)
		case *syscall.SockaddrInet6:
			rc.raddr.s = fmt.Sprintf("%s:%d", sockType.Addr, sockType.Port)
		case *syscall.SockaddrUnix:
			rc.raddr.s = sockType.Name
		}
		rc.offset = &offset
		break
	}

	// Add the new connection back to the ring.
	sqe, commit := l.r.SubmitEntry()
	sqe.Opcode = PollAdd
	sqe.Fd = int32(rc.fd)
	sqe.UFlags = int32(POLLIN)
	sqe.UserData = newConnInfo.id
	commit()
	ready := int32(1)
	rc.pollReady = &ready
	go rc.run()

	// Add the old connection back as well.
	// NOTE(review): the listener was registered in conns under its ring
	// id, but it is re-armed here with UserData = uint64(cInfo.fd) — the
	// next completion will not match the conns map entry; confirm.
	sqe, commit = l.r.SubmitEntry()
	sqe.Opcode = PollAdd
	sqe.Fd = int32(cInfo.fd)
	sqe.UFlags = int32(POLLIN)
	sqe.UserData = uint64(cInfo.fd)
	commit()

	// Wait for the new connection to be accepted.
	// TODO: If this is unbuffered it will block, alternatively it could be
	// sent in a separate goroutine to ensure the main ring code isn't
	// blocking.
	l.newConn <- &rc
}

// Close implements the net.Listener interface.
func (l *ringListener) Close() error {
	// Stop the run loop before closing the underlying file.
	close(l.stop)
	return l.f.Close()

}

// Addr implements the net.Listener interface.
248 | func (l *ringListener) Addr() net.Addr { 249 | return l.a 250 | } 251 | 252 | // Accept implements the net.Listener interface. 253 | func (l *ringListener) Accept() (net.Conn, error) { 254 | return <-l.newConn, nil 255 | } 256 | 257 | // Returns the file descriptor of the connection. 258 | func (l *ringListener) Fd() int { 259 | if l.f == nil { 260 | return -1 261 | } 262 | return int(l.f.Fd()) 263 | } 264 | 265 | // SockoptListener returns a net.Listener that is Ring based. 266 | func (r *Ring) SockoptListener(network, address string, errHandler func(error), sockopts ...int) (net.Listener, error) { 267 | var ( 268 | err error 269 | fd int 270 | sockAddr syscall.Sockaddr 271 | ) 272 | l := &ringListener{ 273 | r: r, 274 | a: &addr{net: network}, 275 | stop: make(chan struct{}), 276 | newConn: make(chan net.Conn, 1024), 277 | connGet: make(chan chan net.Conn), 278 | errHandler: errHandler, 279 | } 280 | 281 | switch network { 282 | case "tcp", "tcp4": 283 | fd, err = syscall.Socket(syscall.AF_INET, syscall.SOCK_STREAM, 0) 284 | if err != nil { 285 | return nil, fmt.Errorf("could not open socket") 286 | } 287 | netAddr, err := net.ResolveTCPAddr(network, address) 288 | if err != nil { 289 | return nil, fmt.Errorf("could not open socket") 290 | } 291 | l.a.net = netAddr.Network() 292 | l.a.s = netAddr.String() 293 | 294 | var ipAddr [4]byte 295 | copy(ipAddr[:], netAddr.IP) 296 | sockAddr = &syscall.SockaddrInet4{ 297 | Port: netAddr.Port, 298 | Addr: ipAddr, 299 | } 300 | case "tcp6": 301 | fd, err = syscall.Socket(syscall.AF_INET6, syscall.SOCK_STREAM, 0) 302 | if err != nil { 303 | return nil, fmt.Errorf("could not open socket") 304 | } 305 | netAddr, err := net.ResolveTCPAddr(network, address) 306 | if err != nil { 307 | return nil, fmt.Errorf("could not open socket") 308 | } 309 | l.a.net = netAddr.Network() 310 | l.a.s = netAddr.String() 311 | 312 | ipAddr := [16]byte{} 313 | copy(ipAddr[:], netAddr.IP) 314 | sockAddr = &syscall.SockaddrInet6{ 315 | Port: 
netAddr.Port, 316 | Addr: ipAddr, 317 | } 318 | case "unix": 319 | fd, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) 320 | sockAddr = &syscall.SockaddrUnix{ 321 | Name: address, 322 | } 323 | case "udp", "udp4": 324 | fd, err = syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0) 325 | if err != nil { 326 | return nil, fmt.Errorf("could not open socket") 327 | } 328 | netAddr, err := net.ResolveUDPAddr(network, address) 329 | if err != nil { 330 | return nil, fmt.Errorf("could not open socket") 331 | } 332 | ipAddr := [4]byte{} 333 | copy(ipAddr[:], netAddr.IP) 334 | sockAddr = &syscall.SockaddrInet4{ 335 | Port: netAddr.Port, 336 | Addr: ipAddr, 337 | } 338 | case "udp6": 339 | fd, err = syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0) 340 | if err != nil { 341 | return nil, fmt.Errorf("could not open socket") 342 | } 343 | netAddr, err := net.ResolveUDPAddr(network, address) 344 | if err != nil { 345 | return nil, fmt.Errorf("could not open socket") 346 | } 347 | l.a.net = netAddr.Network() 348 | l.a.s = netAddr.String() 349 | 350 | ipAddr := [16]byte{} 351 | copy(ipAddr[:], netAddr.IP) 352 | sockAddr = &syscall.SockaddrInet6{ 353 | Port: netAddr.Port, 354 | Addr: ipAddr, 355 | } 356 | default: 357 | return nil, fmt.Errorf("unknown network family: %s", network) 358 | } 359 | if err != nil { 360 | syscall.Close(fd) 361 | return nil, err 362 | } 363 | 364 | for _, sockopt := range sockopts { 365 | if sockopt == SOReuseport { 366 | err = syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, sockopt, 1) 367 | if err != nil { 368 | syscall.Close(fd) 369 | return nil, err 370 | } 371 | } else if sockopt == TCPFastopen { 372 | if err := FastOpenAllowed(); err != nil { 373 | return nil, err 374 | } 375 | err = syscall.SetsockoptInt(fd, syscall.SOL_TCP, sockopt, 1) 376 | if err != nil { 377 | syscall.Close(fd) 378 | return nil, err 379 | } 380 | } 381 | } 382 | 383 | if err := syscall.Bind(fd, sockAddr); err != nil { 384 | syscall.Close(fd) 385 | return 
nil, err 386 | } 387 | 388 | if err := syscall.Listen(fd, syscall.SOMAXCONN); err != nil { 389 | syscall.Close(fd) 390 | return nil, err 391 | } 392 | 393 | f := os.NewFile(uintptr(fd), "l") 394 | l.f = f 395 | l.debug = r.debug 396 | go l.run() 397 | 398 | return l, nil 399 | } 400 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `io_uring` Go 2 | [![GoDoc](https://godoc.org/github.com/hodgesds/iouring-go?status.svg)](https://godoc.org/github.com/hodgesds/iouring-go) 3 | 4 | **WORK IN PROGRESS** This library adds support for [`io_uring`](https://kernel.dk/io_uring.pdf) for 5 | Go. This library is similar to [liburing](https://github.com/axboe/liburing). 6 | If you want to contribute feel free to send PRs or emails, there's plenty of 7 | things that need cleaned up. Also, check out [@dshylyak](https://github.com/dshulyak)'s 8 | [uring](https://github.com/dshulyak/uring) library as well for a similar approach. 9 | Ideally, these approaches would be added to the Go runtime for optimal efficiency, 10 | so these libraries are more of a POC, see [here](https://github.com/dshulyak/uring/issues/1). 11 | 12 | # Interacting with the Submit/Completion Queues 13 | ## Design 14 | The library is designed so that if you want to use your own implementation for 15 | handling submissions/completions that everything is available for use. 16 | Alternatively, there helper methods on the `Ring` struct that also interact 17 | with standard library interfaces as well. There is also a interface for 18 | creating a `net.Listener`, but it is still a work in progress. 19 | 20 | ## Submission Queue 21 | The submission and completion queues are both mmap'd as slices, the question 22 | then becomes how to design an efficient API that is also able to interact with 23 | many of the standard library interfaces. 
One choice is to run a background
24 | goroutine that manages all operations with the queues and use channels for
25 | enqueuing requests. The downside of this approach is that there are [outstanding
26 | issues](https://github.com/golang/go/issues/8899) with the design of channels
27 | that may make it suboptimal for high throughput IO.
28 | 
29 | [`liburing`](https://github.com/axboe/liburing) uses memory barriers for
30 | interacting appropriately with the submission/completion queues of `io_uring`.
31 | One problem with the memory model of Go is that it uses [weak
32 | atomics](https://github.com/golang/go/issues/5045) which can make it difficult
33 | to use `sync/atomic` in all situations. If certain IO operations are to be
34 | carried out in a specific order then this becomes a real challenge.
35 | 
36 | The current challenge with the SQ is that currently for each reader/writer
37 | interface every time a read or write is submitted the
38 | [`Enter`](https://godoc.org/github.com/hodgesds/iouring-go#Enter) method is
39 | called on the ring. These could be batched (with a small latency penalty) and
40 | allow for a single enter of the ring, which would result in fewer syscalls.
41 | 
42 | ## Completion Queue
43 | Completion queues have the difficulty of many concurrent readers which
44 | need to synchronize updating the position of the head. The current solution
45 | is to have a separate background goroutine that tracks the position of the
46 | out of order completions and updates the head as necessary. This separates the
47 | logic of synchronizing the updating of the CQ head from handling an SQ request.
48 | 
49 | # Setup
50 | Ulimit values for locked memory address space may need to be adjusted. If the
51 | following error occurs when running tests then the `memlock` value in
52 | [`/etc/security/limits.conf`](https://linux.die.net/man/5/limits.conf) may need
53 | to be increased.
54 | 55 | ``` 56 | === RUN TestNew 57 | TestNew: ring_test.go:13: 58 | Error Trace: ring_test.go:13 59 | Error: Received unexpected error: 60 | cannot allocate memory 61 | Test: TestNew 62 | ``` 63 | 64 | The ulimit value must be greater than the ring size, use `ulimit -l` to view 65 | the current limit. 66 | 67 | 68 | # Example 69 | Here is a minimal example to get started that writes to a file using a ring: 70 | 71 | ``` 72 | package main 73 | 74 | import ( 75 | "log" 76 | "os" 77 | 78 | "github.com/hodgesds/iouring-go" 79 | ) 80 | 81 | func main() { 82 | r, err := iouring.New(1024, &iouring.Params{ 83 | Features: iouring.FeatNoDrop, 84 | }) 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | 89 | // Open a file for registering with the ring. 90 | f, err := os.OpenFile("hello.txt", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) 91 | if err != nil { 92 | log.Fatal(err) 93 | } 94 | 95 | // Register the file with the ring, which returns an io.WriteCloser. 96 | rw, err := r.FileReadWriter(f) 97 | if err != nil { 98 | log.Fatal(err) 99 | } 100 | 101 | if _, err := rw.Write([]byte("hello io_uring!")); err != nil { 102 | log.Fatal(err) 103 | } 104 | 105 | // Close the WriteCloser, which closes the open file (f). 106 | if err := r.Close(); err != nil { 107 | log.Fatal(err) 108 | } 109 | } 110 | ``` 111 | 112 | # Benchmarks 113 | I haven't really wanted to add any benchmarks as I haven't spent the time to 114 | really write good benchmarks. 
However, here's some initial numbers with some 115 | comments: 116 | 117 | ``` 118 | BenchmarkFileWrite 119 | BenchmarkFileWrite/os-file-write-128 120 | BenchmarkFileWrite/os-file-write-128-8 245845 4649 ns/op 27.53 MB/s 0 B/op 0 allocs/op 121 | BenchmarkFileWrite/os-file-write-512 122 | BenchmarkFileWrite/os-file-write-512-8 243472 4867 ns/op 105.20 MB/s 0 B/op 0 allocs/op 123 | BenchmarkFileWrite/os-file-write-1024 124 | BenchmarkFileWrite/os-file-write-1024-8 212593 5320 ns/op 192.48 MB/s 0 B/op 0 allocs/op 125 | BenchmarkFileWrite/os-file-write-2048 126 | BenchmarkFileWrite/os-file-write-2048-8 183775 6047 ns/op 338.69 MB/s 0 B/op 0 allocs/op 127 | BenchmarkFileWrite/os-file-write-4096 128 | BenchmarkFileWrite/os-file-write-4096-8 143608 7614 ns/op 537.98 MB/s 0 B/op 0 allocs/op 129 | BenchmarkRingWrite 130 | BenchmarkRingWrite/ring-1024-write-128 131 | BenchmarkRingWrite/ring-1024-write-128-8 126456 9346 ns/op 13.70 MB/s 32 B/op 1 allocs/op 132 | BenchmarkRingWrite/ring-1024-write-512 133 | BenchmarkRingWrite/ring-1024-write-512-8 119118 10702 ns/op 47.84 MB/s 32 B/op 1 allocs/op 134 | BenchmarkRingWrite/ring-1024-write-1024 135 | BenchmarkRingWrite/ring-1024-write-1024-8 115423 10600 ns/op 96.60 MB/s 32 B/op 1 allocs/op 136 | BenchmarkRingWrite/ring-8192-write-2048 137 | BenchmarkRingWrite/ring-8192-write-2048-8 103276 11006 ns/op 186.07 MB/s 32 B/op 1 allocs/op 138 | BenchmarkRingWrite/ring-8192-write-4096 139 | BenchmarkRingWrite/ring-8192-write-4096-8 87127 13704 ns/op 298.90 MB/s 32 B/op 1 allocs/op 140 | BenchmarkRingDeadlineWrite 141 | BenchmarkRingDeadlineWrite/ring-1024-deadline-1ms-128 142 | BenchmarkRingDeadlineWrite/ring-1024-deadline-1ms-128-8 102620 9979 ns/op 12.83 MB/s 32 B/op 1 allocs/op 143 | BenchmarkRingDeadlineWrite/ring-1024-deadline-100µs-512 144 | BenchmarkRingDeadlineWrite/ring-1024-deadline-100µs-512-8 118021 10479 ns/op 48.86 MB/s 32 B/op 1 allocs/op 145 | BenchmarkRingDeadlineWrite/ring-1024-deadline-10µs-1024 146 | 
BenchmarkRingDeadlineWrite/ring-1024-deadline-10µs-1024-8 103600 10232 ns/op 100.08 MB/s 32 B/op 1 allocs/op 147 | BenchmarkRingDeadlineWrite/ring-8192-deadline-1µs-2048 148 | BenchmarkRingDeadlineWrite/ring-8192-deadline-1µs-2048-8 101726 11330 ns/op 180.75 MB/s 32 B/op 1 allocs/op 149 | BenchmarkRingDeadlineWrite/ring-8192-deadline-1µs-4096 150 | BenchmarkRingDeadlineWrite/ring-8192-deadline-1µs-4096-8 87483 13547 ns/op 302.35 MB/s 32 B/op 1 allocs/op 151 | BenchmarkRingMultiWrite 152 | BenchmarkRingMultiWrite: ring_benchmark_test.go:207: 153 | ``` 154 | 155 | The first benchmark is just regualar `os.File` `Write` calls. This benchmark 156 | was run on Xeon E3-1505M v5 running on a luks encrypted consumer NVMe drive. 157 | The first thing to note is that the ns/op for for increasing write sizes scales 158 | from 4-8k. That seems pretty reasonable because the runtime is taking care of 159 | handling the system call. 160 | 161 | The `BenchmarkRingWrite` is roughly the same type of 162 | benchmark with an `Enter` being called for each SQE (essentially 1 syscall per 163 | write request). Note, that the ns/op is much higher because of all extra 164 | "stuff" the ring is handling. It also has a single allocation because it uses a 165 | monotonically increasing request id for tracking submissions with completions 166 | (using the user data field in the SQE). The other thing to note is the ring 167 | currently isn't using an eventfd for handling completions, it is doing the good 168 | old fashion brute force approach of submitting the request and then aggressively 169 | checking the CQ for the completion event. This is rather ineficient and burns 170 | some CPU cycles. Switching to an eventfd approach would probably be the ideal 171 | way to solve this problem. So the numbers showing roughly double the ns/op are 172 | pretty reasonable given the current design, which explains the lower throughput 173 | when doing a '1:1' comparison with Go file IO. 
174 | 
175 | The `BenchmarkRingDeadlineWrite` is kind of similar to the `BenchmarkRingWrite`
176 | only it uses a deadline approach for submissions. This in theory should handle
177 | concurrent writes far better, but there is no benchmark that is using
178 | concurrent writes as it is not the easiest type of benchmark to write.
179 | 
180 | The multiwrite API is still a WIP and it in theory should allow for "fan out"
181 | style writes to multiple FDs.
182 | 
183 | Note, this library is still usable to a point where you can come up with your
184 | own concurrent IO scheduling based on whatever heuristics you want (limiting IO
185 | requests per user?!?!). Implementing the perfect IO scheduler for Go is not
186 | really a goal of this project so this library will most likely have some
187 | tradeoffs (i.e. my spare time) when it comes to optimal scheduling algorithms.
188 | If you are interested in this area feel free to send any PRs.
189 | 
190 | # Interacting with the SQ
191 | The submission queue can be interacted with by using the
192 | [`SubmitEntry`](https://godoc.org/github.com/hodgesds/iouring-go#Ring.SubmitEntry)
193 | method on a `Ring`. The returned function **must** be called after all updates
194 | to the `SubmitEntry` are complete and **before** the ring is entered. The
195 | callback is used for synchronization across goroutines.
196 | 197 | # Other References 198 | https://cor3ntin.github.io/posts/iouring/ 199 | 200 | https://github.com/google/vectorio 201 | 202 | https://github.com/shuveb/io_uring-by-example/blob/master/02_cat_uring/main.c 203 | 204 | https://golang.org/pkg/syscall/#Iovec 205 | 206 | https://github.com/golang/go/blob/master/src/runtime/mbarrier.go#L21 207 | -------------------------------------------------------------------------------- /ops_test.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "fmt" 7 | "io" 8 | "io/ioutil" 9 | "log" 10 | "math/rand" 11 | "net" 12 | "os" 13 | "sync" 14 | "syscall" 15 | "testing" 16 | "time" 17 | 18 | "github.com/stretchr/testify/require" 19 | "golang.org/x/sys/unix" 20 | ) 21 | 22 | func TestPrepareAccept(t *testing.T) { 23 | r, err := New(2048, nil) 24 | require.NoError(t, err) 25 | require.NotNil(t, r) 26 | 27 | fd, err := syscall.Socket( 28 | syscall.AF_INET, syscall.SOCK_STREAM, syscall.IPPROTO_TCP) 29 | require.NoError(t, err) 30 | require.True(t, fd > 0) 31 | addr := &syscall.SockaddrInet4{ 32 | Port: 80, 33 | } 34 | copy(addr.Addr[:], net.ParseIP("8.8.8.8")) 35 | id, err := r.PrepareAccept( 36 | fd, 37 | addr, 38 | syscall.SizeofSockaddrInet4, 39 | 0, 40 | ) 41 | require.NoError(t, err) 42 | require.True(t, id > uint64(0)) 43 | } 44 | 45 | func TestClose(t *testing.T) { 46 | r, err := New(2048, nil) 47 | require.NoError(t, err) 48 | require.NotNil(t, r) 49 | 50 | f, err := ioutil.TempFile("", "fsync") 51 | require.NoError(t, err) 52 | defer os.Remove(f.Name()) 53 | 54 | err = r.Close(int(f.Fd())) 55 | require.NoError(t, err) 56 | } 57 | 58 | func TestPrepareConnect(t *testing.T) { 59 | r, err := New(2048, nil) 60 | require.NoError(t, err) 61 | require.NotNil(t, r) 62 | 63 | fd, err := syscall.Socket( 64 | syscall.AF_INET, syscall.SOCK_STREAM, syscall.IPPROTO_TCP) 65 | require.NoError(t, err) 66 | require.True(t, fd > 0) 67 | addr 
:= &syscall.SockaddrInet4{ 68 | Port: 80, 69 | } 70 | copy(addr.Addr[:], net.ParseIP("8.8.8.8")) 71 | id, err := r.PrepareConnect( 72 | fd, 73 | addr, 74 | syscall.SizeofSockaddrInet4, 75 | ) 76 | require.NoError(t, err) 77 | require.True(t, id > uint64(0)) 78 | } 79 | 80 | func TestFadvise(t *testing.T) { 81 | r, err := New(2048, nil) 82 | require.NoError(t, err) 83 | require.NotNil(t, r) 84 | 85 | f, err := ioutil.TempFile("", "fadvise") 86 | require.NoError(t, err) 87 | defer os.Remove(f.Name()) 88 | 89 | data := []byte("hello fadvise") 90 | _, err = f.Write(data) 91 | require.NoError(t, err) 92 | 93 | err = r.Fadvise(int(f.Fd()), 0, uint32(len(data)), unix.FADV_NORMAL) 94 | require.NoError(t, err) 95 | } 96 | 97 | func TestFallocate(t *testing.T) { 98 | r, err := New(2048, nil) 99 | require.NoError(t, err) 100 | require.NotNil(t, r) 101 | 102 | f, err := ioutil.TempFile("", "fallocate") 103 | require.NoError(t, err) 104 | defer os.Remove(f.Name()) 105 | 106 | data := []byte("hello fallocate") 107 | _, err = f.Write(data) 108 | require.NoError(t, err) 109 | 110 | err = r.Fallocate(int(f.Fd()), unix.FALLOC_FL_KEEP_SIZE, 0, int64(len(data))) 111 | require.NoError(t, err) 112 | } 113 | 114 | func TestFsync(t *testing.T) { 115 | r, err := New(2048, nil) 116 | require.NoError(t, err) 117 | require.NotNil(t, r) 118 | 119 | f, err := ioutil.TempFile("", "fsync") 120 | require.NoError(t, err) 121 | defer os.Remove(f.Name()) 122 | 123 | err = r.Fsync(int(f.Fd()), 0) 124 | require.NoError(t, err) 125 | } 126 | 127 | func TestPrepareNop(t *testing.T) { 128 | r, err := New(2048, nil) 129 | require.NoError(t, err) 130 | require.NotNil(t, r) 131 | 132 | id, err := r.PrepareNop() 133 | require.NoError(t, err) 134 | require.True(t, id > uint64(0)) 135 | } 136 | 137 | func BenchmarkPrepareNop(b *testing.B) { 138 | r, err := New(2048, nil) 139 | require.NoError(b, err) 140 | require.NotNil(b, r) 141 | 142 | b.ReportAllocs() 143 | b.ResetTimer() 144 | for i := 0; i < b.N; i++ { 
145 | _, err = r.PrepareNop() 146 | if err != nil { 147 | b.Fatal(err) 148 | } 149 | } 150 | } 151 | 152 | func BenchmarkNop(b *testing.B) { 153 | r, err := New(2048, nil) 154 | require.NoError(b, err) 155 | require.NotNil(b, r) 156 | 157 | b.ReportAllocs() 158 | b.ResetTimer() 159 | for i := 0; i < b.N; i++ { 160 | err = r.Nop() 161 | if err != nil { 162 | b.Fatal(err) 163 | } 164 | } 165 | } 166 | 167 | func BenchmarkNopDeadline(b *testing.B) { 168 | tests := []struct { 169 | ringSize uint 170 | writeSize int 171 | deadline time.Duration 172 | }{ 173 | { 174 | ringSize: 1024, 175 | deadline: 1 * time.Millisecond, 176 | }, 177 | { 178 | ringSize: 1024, 179 | deadline: 100 * time.Microsecond, 180 | }, 181 | { 182 | ringSize: 1024, 183 | deadline: 10 * time.Microsecond, 184 | }, 185 | } 186 | for _, test := range tests { 187 | b.Run( 188 | fmt.Sprintf( 189 | "ring-%d-nop-deadline-%v", 190 | test.ringSize, 191 | test.deadline.String(), 192 | ), 193 | func(b *testing.B) { 194 | r, err := New( 195 | test.ringSize, 196 | nil, 197 | WithDeadline(test.deadline), 198 | ) 199 | require.NoError(b, err) 200 | require.NotNil(b, r) 201 | 202 | b.ReportAllocs() 203 | b.ResetTimer() 204 | for i := 0; i < b.N; i++ { 205 | err = r.Nop() 206 | if err != nil { 207 | b.Fatal(err) 208 | } 209 | } 210 | }, 211 | ) 212 | } 213 | } 214 | 215 | func TestPollAdd(t *testing.T) { 216 | t.Skip("FIX ME") 217 | r, err := New( 218 | 2048, 219 | nil, 220 | WithID(1000000), 221 | WithEnterErrHandler(func(err error) { require.NoError(t, err) }), 222 | ) 223 | require.NoError(t, err) 224 | require.NotNil(t, r) 225 | 226 | data := []byte("foo") 227 | buf := make([]byte, len(data)) 228 | pipeFds := make([]int, 2) 229 | require.NoError(t, unix.Pipe2(pipeFds, syscall.O_NONBLOCK)) 230 | var wg sync.WaitGroup 231 | wg.Add(1) 232 | ready := make(chan struct{}) 233 | go func() { 234 | defer wg.Done() 235 | ready <- struct{}{} 236 | for i := 0; i < 3; i++ { 237 | println("F") 238 | syscall.Read(pipeFds[1], 
buf) 239 | ready <- struct{}{} 240 | println("y") 241 | require.NoError(t, r.PollAdd(pipeFds[1], POLLIN)) 242 | println("D") 243 | } 244 | syscall.Close(pipeFds[0]) 245 | }() 246 | for i := 0; i < 3; i++ { 247 | <-ready 248 | _, err = syscall.Write(pipeFds[0], data) 249 | println("W") 250 | if err == io.EOF { 251 | break 252 | } 253 | } 254 | wg.Wait() 255 | } 256 | 257 | func TestPrepareReadv(t *testing.T) { 258 | r, err := New(2048, nil) 259 | require.NoError(t, err) 260 | require.NotNil(t, r) 261 | 262 | data := []byte("testing...1,2,3") 263 | f, err := ioutil.TempFile("", "example") 264 | require.NoError(t, err) 265 | defer os.Remove(f.Name()) 266 | _, err = f.Write(data) 267 | 268 | require.NoError(t, err) 269 | _, err = f.Seek(0, 0) 270 | require.NoError(t, err) 271 | 272 | v := make([]*syscall.Iovec, 1) 273 | id, err := r.PrepareReadv(int(f.Fd()), v, 0) 274 | require.NoError(t, err) 275 | require.True(t, id > uint64(0)) 276 | } 277 | 278 | func TestSplice(t *testing.T) { 279 | out, err := ioutil.TempFile("", "out") 280 | require.NoError(t, err) 281 | defer os.Remove(out.Name()) 282 | 283 | data := make([]byte, 32) 284 | n, err := rand.Read(data) 285 | require.NoError(t, err) 286 | require.Equal(t, n, 32) 287 | 288 | pipeFds := make([]int, 2) 289 | require.NoError(t, unix.Pipe(pipeFds)) 290 | 291 | var wg sync.WaitGroup 292 | wg.Add(1) 293 | wrote := make(chan struct{}) 294 | go func() { 295 | <-wrote 296 | defer wg.Done() 297 | c, err := unix.Splice( 298 | pipeFds[0], nil, 299 | int(out.Fd()), nil, 300 | 32, 301 | unix.SPLICE_F_MOVE, 302 | ) 303 | require.NoError(t, err) 304 | require.Equal(t, int(c), 32) 305 | }() 306 | 307 | syscall.Write(pipeFds[1], data) 308 | wrote <- struct{}{} 309 | syscall.Close(pipeFds[1]) 310 | wg.Wait() 311 | } 312 | 313 | func TestRingSplice(t *testing.T) { 314 | t.Skip("FIX ME") 315 | r, err := New(2048, nil) 316 | require.NoError(t, err) 317 | require.NotNil(t, r) 318 | 319 | out, err := ioutil.TempFile("", "out") 320 | 
require.NoError(t, err) 321 | defer os.Remove(out.Name()) 322 | 323 | data := make([]byte, 32) 324 | n, err := rand.Read(data) 325 | require.NoError(t, err) 326 | require.Equal(t, n, 32) 327 | 328 | pipeFds := make([]int, 2) 329 | require.NoError(t, unix.Pipe(pipeFds)) 330 | 331 | var wg sync.WaitGroup 332 | wg.Add(1) 333 | wrote := make(chan struct{}) 334 | go func() { 335 | <-wrote 336 | defer wg.Done() 337 | c, err := r.Splice( 338 | pipeFds[0], nil, 339 | int(out.Fd()), nil, 340 | 32, 341 | unix.SPLICE_F_MOVE, 342 | ) 343 | require.NoError(t, err) 344 | require.Equal(t, int(c), 32) 345 | }() 346 | 347 | syscall.Write(pipeFds[1], data) 348 | wrote <- struct{}{} 349 | syscall.Close(pipeFds[1]) 350 | wg.Wait() 351 | } 352 | 353 | func TestRingStatx(t *testing.T) { 354 | r, err := New(2048, nil) 355 | require.NoError(t, err) 356 | require.NotNil(t, r) 357 | 358 | path, err := os.Getwd() 359 | require.NoError(t, err) 360 | 361 | f, err := ioutil.TempFile(path, "statx") 362 | require.NoError(t, err) 363 | defer os.Remove(f.Name()) 364 | 365 | _, err = f.Write([]byte("test")) 366 | require.NoError(t, err) 367 | 368 | var ( 369 | x1 unix.Statx_t 370 | x2 unix.Statx_t 371 | ) 372 | d, err := os.Open(path) 373 | require.NoError(t, err) 374 | defer d.Close() 375 | 376 | err = r.Statx(int(d.Fd()), path, 0, unix.STATX_ALL, &x1) 377 | require.NoError(t, err) 378 | 379 | err = unix.Statx(int(d.Fd()), path, 0, unix.STATX_ALL, &x2) 380 | require.NoError(t, err) 381 | require.Equal(t, x1, x2) 382 | } 383 | 384 | func TestSend(t *testing.T) { 385 | r, err := New(2048, nil) 386 | require.NoError(t, err) 387 | require.NotNil(t, r) 388 | 389 | sockFile := fmt.Sprintf("sock_test_%d.sock", rand.Int()) 390 | 391 | l, err := net.ListenUnix("unix", &net.UnixAddr{ 392 | Name: sockFile, 393 | Net: "unix", 394 | }) 395 | require.NoError(t, err) 396 | defer l.Close() 397 | 398 | b := []byte("some bytes") 399 | 400 | var wg sync.WaitGroup 401 | wg.Add(1) 402 | go func() { 403 | defer 
wg.Done() 404 | conn, err := l.Accept() 405 | if err != nil { 406 | log.Fatal("accept error:", err) 407 | } 408 | 409 | exB := make([]byte, len(b)) 410 | _, err = conn.Read(exB) 411 | require.NoError(t, err) 412 | require.Equal(t, b, exB) 413 | 414 | require.NoError(t, conn.Close()) 415 | }() 416 | 417 | c, err := net.DialUnix("unix", nil, &net.UnixAddr{ 418 | Name: sockFile, 419 | Net: "unix", 420 | }) 421 | require.NoError(t, err) 422 | f, err := c.File() 423 | require.NoError(t, err) 424 | require.NoError(t, r.Send(int(f.Fd()), b, 0)) 425 | wg.Wait() 426 | } 427 | 428 | func BenchmarkStatxRing(b *testing.B) { 429 | r, err := New(2048, nil) 430 | require.NoError(b, err) 431 | require.NotNil(b, r) 432 | 433 | path, err := os.Getwd() 434 | require.NoError(b, err) 435 | 436 | f, err := ioutil.TempFile(path, "statx") 437 | require.NoError(b, err) 438 | defer os.Remove(f.Name()) 439 | 440 | _, err = f.Write([]byte("test")) 441 | require.NoError(b, err) 442 | 443 | var x1 unix.Statx_t 444 | d, err := os.Open(path) 445 | require.NoError(b, err) 446 | defer d.Close() 447 | 448 | b.ReportAllocs() 449 | b.ResetTimer() 450 | for i := 0; i < b.N; i++ { 451 | err = r.Statx(int(d.Fd()), path, 0, unix.STATX_ALL, &x1) 452 | if err != nil { 453 | b.Fatal(err) 454 | } 455 | } 456 | } 457 | 458 | func TestPrepareTimeout(t *testing.T) { 459 | r, err := New(2048, nil) 460 | require.NoError(t, err) 461 | require.NotNil(t, r) 462 | 463 | id, err := r.PrepareTimeout(&syscall.Timespec{Sec: 1}, 1, 0) 464 | require.NoError(t, err) 465 | require.True(t, id > uint64(0)) 466 | } 467 | 468 | func TestPrepareTimeoutRemove(t *testing.T) { 469 | r, err := New(2048, nil) 470 | require.NoError(t, err) 471 | require.NotNil(t, r) 472 | 473 | id, err := r.PrepareTimeoutRemove(0, 0) 474 | require.NoError(t, err) 475 | require.True(t, id > uint64(0)) 476 | } 477 | 478 | func TestPrepareWritev(t *testing.T) { 479 | r, err := New(2048, nil) 480 | require.NoError(t, err) 481 | require.NotNil(t, r) 482 | 
483 | f, err := ioutil.TempFile("", "example") 484 | require.NoError(t, err) 485 | defer os.Remove(f.Name()) 486 | 487 | b := byte(1) 488 | v := &syscall.Iovec{ 489 | Base: &b, 490 | } 491 | v.SetLen(1) 492 | iovs := []*syscall.Iovec{v} 493 | id, err := r.PrepareReadv(int(f.Fd()), iovs, 0) 494 | require.NoError(t, err) 495 | require.True(t, id > uint64(0)) 496 | } 497 | -------------------------------------------------------------------------------- /ring.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "os" 7 | "runtime" 8 | "sync" 9 | "sync/atomic" 10 | "syscall" 11 | "time" 12 | 13 | "github.com/pkg/errors" 14 | "golang.org/x/sys/unix" 15 | ) 16 | 17 | // Ring contains an io_uring submit and completion ring. 18 | type Ring struct { 19 | fd int 20 | p *Params 21 | cq *CompletionQueue 22 | c *completer 23 | cqMu sync.RWMutex 24 | sq *SubmitQueue 25 | sqMu sync.RWMutex 26 | sqPool sync.Pool 27 | idx *uint64 28 | debug bool 29 | fileReg FileRegistry 30 | deadline time.Duration 31 | enterErrHandler func(error) 32 | submitter submitter 33 | 34 | stop chan struct{} 35 | completions chan *completionRequest 36 | eventFd int 37 | completionPool sync.Pool 38 | } 39 | 40 | // New is used to create an iouring.Ring. 
func New(size uint, p *Params, opts ...RingOption) (*Ring, error) {
	// Creates the kernel ring via Setup, mmaps the SQ/CQ, applies options,
	// and starts the completion-handling goroutines (r.run and r.c.run).
	if p == nil {
		p = &Params{}
	}
	fd, err := Setup(size, p)
	if err != nil {
		return nil, err
	}
	var (
		cq       CompletionQueue
		sq       SubmitQueue
		sqWrites uint32
	)
	if err := MmapRing(fd, p, &sq, &cq); err != nil {
		return nil, err
	}
	// Monotonic request id counter and the "entered" gate used by the SQ
	// locking helpers.
	idx := uint64(0)
	entered := uint32(0)
	sq.entered = &entered

	sq.writes = &sqWrites
	r := &Ring{
		p:           p,
		fd:          fd,
		cq:          &cq,
		sq:          &sq,
		idx:         &idx,
		fileReg:     nil,
		eventFd:     -1, // no eventfd configured unless an option sets one
		stop:        make(chan struct{}, 32),
		completions: make(chan *completionRequest, len(cq.Entries)),
		c:           newCompleter(&cq, 512),
		completionPool: sync.Pool{
			New: func() interface{} {
				return &completionRequest{
					done: make(chan struct{}, 8),
				}
			},
		},
	}
	for _, opt := range opts {
		if err := opt(r); err != nil {
			return nil, err
		}
	}
	go r.run()
	go r.c.run()

	return r, nil
}

// CQ returns the CompletionQueue for the ring.
func (r *Ring) CQ() *CompletionQueue {
	return r.cq
}

// SQ returns the SubmitQueue for the ring.
func (r *Ring) SQ() *SubmitQueue {
	return r.sq
}

// EventFd returns the file descriptor of the eventfd if it is set, otherwise
// it returns the default value of -1.
func (r *Ring) EventFd() int {
	return r.eventFd
}

// Enter is used to enter the ring. It wraps the raw Enter syscall, adding
// the SQ wakeup flag when the kernel thread needs waking and holding the
// SQ enter lock around the syscall.
func (r *Ring) Enter(toSubmit uint, minComplete uint, flags uint, sigset *unix.Sigset_t) (int, error) {
	// Acquire the submit barrier so that the ring can safely be entered.
	if r.sq.NeedWakeup() {
		flags |= EnterSqWakeup
	}
	// Increase the write counter as the caller will be
	// updating the returned SubmitEntry.
	r.sq.enterLock()
	// TODO: Document how sigset should be used in relation with the go runtime and
	// io_uring_enter.
	completed, err := Enter(r.fd, toSubmit, minComplete, flags, sigset)
	r.sq.enterUnlock()
	return completed, err
}

// run is used to run the ring and handle completions. It owns the inflight
// request map: requests arrive on r.completions, the ring is entered, and
// onEntry matches CQEs back to their waiting requests. While requests remain
// inflight it busy-polls via the buffered retry channel.
func (r *Ring) run() {
	inflight := map[uint64]*completionRequest{}
	retry := make(chan struct{}, 2)
	for {
		select {
		case <-r.stop:
			return
		case cr := <-r.completions:
			inflight[cr.id] = cr
			// TODO: Use the number completed for tracking
			count, err := r.Enter(uint(len(inflight)), 0, EnterGetEvents, nil)
			if err != nil {
				if r.enterErrHandler != nil {
					r.enterErrHandler(err)
				}
				// There still may be completed requests so continue on.
			}
			r.onEntry(inflight, count)
			if len(inflight) > 0 {
				retry <- struct{}{}
			}
		case <-retry:
			// Opportunistically pick up one more pending request (non-
			// blocking) before re-entering the ring.
			select {
			case cr := <-r.completions:
				inflight[cr.id] = cr
				_, err := r.Enter(uint(len(inflight)), 0, EnterGetEvents, nil)
				if err != nil {
					if r.enterErrHandler != nil {
						r.enterErrHandler(err)
					}
				}
			default:
			}
			r.onEntry(inflight, 0)
			if len(inflight) > 0 {
				// TODO: Use eventfd for polling instead.
				time.Sleep(200 * time.Nanosecond)
				retry <- struct{}{}
			}
		}
	}
}

// complete submits a completion request for reqID to the run goroutine and
// blocks until the matching CQE result and flags are delivered. Request
// structs are recycled through r.completionPool.
func (r *Ring) complete(reqID uint64) (int32, uint32) {
	req := r.completionPool.Get().(*completionRequest)
	req.id = reqID
	req.res = 0
	req.flags = 0
	r.completions <- req
	<-req.done
	res := req.res
	flags := req.flags
	r.completionPool.Put(req)
	return res, flags
}

// onEntry scans the CQ for entries matching inflight requests, delivers
// their results, and advances the CQ head past the leading run of consumed
// entries. The count parameter (completions reported by Enter) is currently
// unused.
func (r *Ring) onEntry(inflight map[uint64]*completionRequest, count int) {
	mask := atomic.LoadUint32(r.cq.Mask)
	head := atomic.LoadUint32(r.cq.Head)
	tail := atomic.LoadUint32(r.cq.Tail)
	nEntries := uint32(len(r.cq.Entries))
	// seenIdx counts the contiguous prefix of matched entries starting at
	// head; only that prefix can be released back to the kernel.
	seenIdx := uint32(0)
	seen := true
	for i := head & mask; i < nEntries; i++ {
		cqe := r.cq.Entries[i]
		if cr, ok := inflight[cqe.UserData]; ok {
			if seen {
				seenIdx++
			}
			cr.res = cqe.Res
			cr.flags = cqe.Flags
			cr.done <- struct{}{}
			delete(inflight, cr.id)
		} else {
			// Gap in the prefix: later matches cannot move the head.
			seen = false
		}
		if i == tail&mask {
			atomic.StoreUint32(r.cq.Head, head+seenIdx)
			return
		}
	}
	// Handle wrapping of the ring: continue the scan from index 0 to tail.
	seen = true
	for i := uint32(0); i < tail&mask; i++ {
		cqe := r.cq.Entries[i]
		if cr, ok := inflight[cqe.UserData]; ok {
			if seen {
				seenIdx++
			}
			cr.res = cqe.Res
			cr.flags = cqe.Flags
			cr.done <- struct{}{}
			delete(inflight, cr.id)
		} else {
			seen = false
		}
	}
	atomic.StoreUint32(r.cq.Head, head+seenIdx)
}

// getCqe is used for getting a CQE result.
225 | func (r *Ring) getCqe(reqID uint64) (int32, uint32, error) { 226 | cq := r.cq 227 | findCqe: 228 | 229 | head := atomic.LoadUint32(cq.Head) 230 | tail := atomic.LoadUint32(cq.Tail) 231 | mask := atomic.LoadUint32(cq.Mask) 232 | end := int(tail & mask) 233 | 234 | for x := int(head & mask); x < len(cq.Entries); x++ { 235 | cqe := cq.Entries[x] 236 | if cqe.UserData == reqID { 237 | if cqe.Res < 0 { 238 | return 0, 0, syscall.Errno(-cqe.Res) 239 | } 240 | return cqe.Res, cqe.Flags, nil 241 | } 242 | if x == end { 243 | goto findCqe 244 | return 0, 0, errCQEMissing 245 | } 246 | } 247 | tail = atomic.LoadUint32(cq.Tail) 248 | mask = atomic.LoadUint32(cq.Mask) 249 | end = int(tail & mask) 250 | for x := 0; x < end; x++ { 251 | cqe := cq.Entries[x] 252 | if cqe.UserData == reqID { 253 | if cqe.Res < 0 { 254 | return 0, 0, syscall.Errno(-cqe.Res) 255 | } 256 | return cqe.Res, cqe.Flags, nil 257 | } 258 | if x == end { 259 | goto findCqe 260 | return 0, 0, errCQEMissing 261 | } 262 | } 263 | return 0, 0, errCQEMissing 264 | } 265 | 266 | // CanEnter returns whether or not the ring can be entered. 267 | func (r *Ring) CanEnter() bool { 268 | // TODO: figure out this 269 | return true 270 | } 271 | 272 | // ShouldFlush returns if the ring should flush due to cq being overflown. 273 | func (r *Ring) ShouldFlush() bool { 274 | return atomic.LoadUint32(r.sq.Flags)&SqCqOverflow != 0 275 | } 276 | 277 | // NeedsEnter returns if the ring needs to be entered. 278 | func (r *Ring) NeedsEnter() bool { 279 | return atomic.LoadUint32(r.sq.Flags)&SqNeedWakeup != 0 280 | } 281 | 282 | // Stop is used to stop the ring. 
283 | func (r *Ring) Stop() error { 284 | if err := r.closeSq(); err != nil { 285 | return err 286 | } 287 | if r.p.Flags&FeatSingleMmap == 0 { 288 | if err := r.closeCq(); err != nil { 289 | return err 290 | } 291 | } 292 | if r.submitter != nil { 293 | r.submitter.stop() 294 | } 295 | return syscall.Close(r.fd) 296 | } 297 | 298 | func (r *Ring) closeCq() error { 299 | r.cqMu.Lock() 300 | defer r.cqMu.Unlock() 301 | if r.cq == nil { 302 | return nil 303 | } 304 | 305 | _, _, errno := syscall.Syscall6( 306 | syscall.SYS_MUNMAP, 307 | r.cq.ptr, 308 | uintptr(r.cq.Size), 309 | uintptr(0), 310 | uintptr(0), 311 | uintptr(0), 312 | uintptr(0), 313 | ) 314 | if errno != 0 { 315 | err := errno 316 | return errors.Wrap(err, "failed to munmap cq ring") 317 | } 318 | r.cq = nil 319 | return nil 320 | } 321 | 322 | func (r *Ring) closeSq() error { 323 | r.sqMu.Lock() 324 | defer r.sqMu.Unlock() 325 | if r.sq == nil { 326 | return nil 327 | } 328 | 329 | _, _, errno := syscall.Syscall6( 330 | syscall.SYS_MUNMAP, 331 | r.sq.ptr, 332 | uintptr(r.sq.Size), 333 | uintptr(0), 334 | uintptr(0), 335 | uintptr(0), 336 | uintptr(0), 337 | ) 338 | if errno != 0 { 339 | err := errno 340 | return errors.Wrap(err, "failed to munmap sq ring") 341 | } 342 | r.sq = nil 343 | return nil 344 | } 345 | 346 | // SubmitHead returns the position of the head of the submit queue. This method 347 | // is safe for calling concurrently. 348 | func (r *Ring) SubmitHead() int { 349 | return int(atomic.LoadUint32(r.sq.Head) & atomic.LoadUint32(r.sq.Mask)) 350 | } 351 | 352 | // SubmitTail returns the position of the tail of the submit queue. This method 353 | // is safe for calling concurrently. 354 | func (r *Ring) SubmitTail() int { 355 | return int(atomic.LoadUint32(r.sq.Tail) & atomic.LoadUint32(r.sq.Mask)) 356 | } 357 | 358 | // CompleteHead returns the position of the head of the completion queue. This 359 | // method is safe for calling concurrently. 
360 | func (r *Ring) CompleteHead() int { 361 | return int(atomic.LoadUint32(r.cq.Head) & atomic.LoadUint32(r.cq.Mask)) 362 | } 363 | 364 | // CompleteTail returns the position of the tail of the submit queue. This method 365 | // is safe for calling concurrently. 366 | func (r *Ring) CompleteTail() int { 367 | return int(atomic.LoadUint32(r.cq.Tail) & atomic.LoadUint32(r.cq.Mask)) 368 | } 369 | 370 | // SubmitEntry returns the next available SubmitEntry or nil if the ring is 371 | // busy. The returned function should be called after SubmitEntry is ready to 372 | // enter the ring. 373 | func (r *Ring) SubmitEntry() (*SubmitEntry, func()) { 374 | // This function roughly follows this: 375 | // https://github.com/axboe/liburing/blob/master/src/queue.c#L258 376 | 377 | getNext: 378 | tail := atomic.LoadUint32(r.sq.Tail) 379 | head := atomic.LoadUint32(r.sq.Head) 380 | mask := atomic.LoadUint32(r.sq.Mask) 381 | next := tail&mask + 1 382 | if next <= uint32(len(r.sq.Entries)) { 383 | // Make sure the ring is safe for updating by acquring the 384 | // update barrier. 385 | if !atomic.CompareAndSwapUint32(r.sq.Tail, tail, next) { 386 | runtime.Gosched() 387 | goto getNext 388 | } 389 | if atomic.LoadUint32(r.sq.entered) != 0 { 390 | runtime.Gosched() 391 | goto getNext 392 | } 393 | atomic.AddUint32(r.sq.writes, 1) 394 | 395 | r.sq.Entries[tail&mask].Reset() 396 | return &r.sq.Entries[tail&mask], func() { 397 | r.sq.completeWrite() 398 | r.sq.Array[next-1] = head & mask 399 | } 400 | } 401 | // When the ring wraps restart. 402 | atomic.CompareAndSwapUint32(r.sq.Tail, tail, 0) 403 | goto getNext 404 | } 405 | 406 | // ID returns an id for a SQEs, it is a monotonically increasing value (until 407 | // uint64 wrapping). 408 | func (r *Ring) ID() uint64 { 409 | return atomic.AddUint64(r.idx, 1) 410 | } 411 | 412 | // Fd returns the file descriptor of the ring. 
413 | func (r *Ring) Fd() int { 414 | return r.fd 415 | } 416 | 417 | // FileRegistry returns the FileRegistry for the Ring. 418 | func (r *Ring) FileRegistry() FileRegistry { 419 | return r.fileReg 420 | } 421 | 422 | // FileReadWriter returns an io.ReadWriter from an os.File that uses the ring. 423 | // Note that is is not valid to use other operations on the file (Seek/Close) 424 | // in combination with the reader. 425 | func (r *Ring) FileReadWriter(f *os.File) (ReadWriteSeekerCloser, error) { 426 | return r.fileReadWriter(f) 427 | } 428 | 429 | func (r *Ring) fileReadWriter(f *os.File) (*ringFIO, error) { 430 | var offset int64 431 | o, err := f.Seek(0, 0) 432 | if err != nil { 433 | return nil, err 434 | } 435 | offset = int64(o) 436 | rw := &ringFIO{ 437 | r: r, 438 | f: f, 439 | fd: int32(f.Fd()), 440 | fOffset: &offset, 441 | c: r.c, 442 | } 443 | if r.fileReg == nil { 444 | return rw, nil 445 | } 446 | return rw, r.fileReg.Register(int(f.Fd())) 447 | } 448 | -------------------------------------------------------------------------------- /ops.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package iouring 4 | 5 | import ( 6 | "encoding/binary" 7 | "runtime" 8 | "syscall" 9 | "unsafe" 10 | 11 | "github.com/pkg/errors" 12 | "golang.org/x/sys/unix" 13 | ) 14 | 15 | var ( 16 | errRingUnavailable = errors.New("ring unavailable") 17 | ) 18 | 19 | // PrepareAccept is used to prepare a SQE for an accept(2) call. 
20 | func (r *Ring) PrepareAccept( 21 | fd int, 22 | addr syscall.Sockaddr, 23 | socklen uint32, 24 | flags int, 25 | ) (uint64, error) { 26 | sqe, ready := r.SubmitEntry() 27 | if sqe == nil { 28 | return 0, errRingUnavailable 29 | } 30 | 31 | sqe.Opcode = Accept 32 | sqe.UserData = r.ID() 33 | sqe.Fd = int32(fd) 34 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&addr))) 35 | sqe.Offset = uint64(socklen) 36 | sqe.UFlags = int32(flags) 37 | 38 | ready() 39 | return sqe.UserData, nil 40 | } 41 | 42 | // PrepareClose is used to prepare a close(2) call. 43 | func (r *Ring) PrepareClose(fd int) (uint64, error) { 44 | sqe, ready := r.SubmitEntry() 45 | if sqe == nil { 46 | return 0, errRingUnavailable 47 | } 48 | sqe.Opcode = Close 49 | sqe.UserData = r.ID() 50 | sqe.Fd = int32(fd) 51 | 52 | ready() 53 | return sqe.UserData, nil 54 | } 55 | 56 | // Close is implements close(2). 57 | func (r *Ring) Close(fd int) error { 58 | id, err := r.PrepareClose(fd) 59 | if err != nil { 60 | return err 61 | } 62 | errno, _ := r.complete(id) 63 | if errno < 0 { 64 | return syscall.Errno(-errno) 65 | } 66 | return nil 67 | } 68 | 69 | // PrepareConnect is used to prepare a SQE for a connect(2) call. 70 | func (r *Ring) PrepareConnect( 71 | fd int, 72 | addr syscall.Sockaddr, 73 | socklen uint32, 74 | ) (uint64, error) { 75 | sqe, ready := r.SubmitEntry() 76 | if sqe == nil { 77 | return 0, errRingUnavailable 78 | } 79 | 80 | sqe.Opcode = Connect 81 | sqe.UserData = r.ID() 82 | sqe.Fd = int32(fd) 83 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&addr))) 84 | sqe.Len = socklen 85 | 86 | ready() 87 | return sqe.UserData, nil 88 | } 89 | 90 | // PrepareFadvise is used to prepare a fadvise call. 
91 | func (r *Ring) PrepareFadvise( 92 | fd int, offset uint64, n uint32, advise int) (uint64, error) { 93 | sqe, ready := r.SubmitEntry() 94 | if sqe == nil { 95 | return 0, errRingUnavailable 96 | } 97 | 98 | sqe.Opcode = Fadvise 99 | sqe.UserData = r.ID() 100 | sqe.Fd = int32(fd) 101 | sqe.Len = n 102 | sqe.Offset = offset 103 | sqe.UFlags = int32(advise) 104 | 105 | ready() 106 | return sqe.UserData, nil 107 | } 108 | 109 | // Fadvise implements fadvise. 110 | func (r *Ring) Fadvise(fd int, offset uint64, n uint32, advise int) error { 111 | id, err := r.PrepareFadvise(fd, offset, n, advise) 112 | if err != nil { 113 | return err 114 | } 115 | errno, _ := r.complete(id) 116 | if errno < 0 { 117 | return syscall.Errno(-errno) 118 | } 119 | return nil 120 | } 121 | 122 | // PrepareFallocate is used to prepare a fallocate call. 123 | func (r *Ring) PrepareFallocate( 124 | fd int, mode uint32, offset int64, n int64) (uint64, error) { 125 | sqe, ready := r.SubmitEntry() 126 | if sqe == nil { 127 | return 0, errRingUnavailable 128 | } 129 | 130 | sqe.Opcode = Fallocate 131 | sqe.UserData = r.ID() 132 | sqe.Fd = int32(fd) 133 | sqe.Addr = uint64(n) 134 | sqe.Len = mode 135 | sqe.Offset = uint64(offset) 136 | 137 | ready() 138 | return sqe.UserData, nil 139 | } 140 | 141 | // Fallocate implements fallocate. 142 | func (r *Ring) Fallocate(fd int, mode uint32, offset int64, n int64) error { 143 | id, err := r.PrepareFallocate(fd, mode, offset, n) 144 | if err != nil { 145 | return err 146 | } 147 | errno, _ := r.complete(id) 148 | if errno < 0 { 149 | return syscall.Errno(-errno) 150 | } 151 | return nil 152 | } 153 | 154 | // PrepareFsync is used to prepare a fsync(2) call. 
155 | func (r *Ring) PrepareFsync(fd int, flags int) (uint64, error) { 156 | sqe, ready := r.SubmitEntry() 157 | if sqe == nil { 158 | return 0, errRingUnavailable 159 | } 160 | sqe.Opcode = Fsync 161 | sqe.UserData = r.ID() 162 | sqe.Fd = int32(fd) 163 | sqe.UFlags = int32(flags) 164 | 165 | ready() 166 | return sqe.UserData, nil 167 | } 168 | 169 | // Fsync implements fsync(2). 170 | func (r *Ring) Fsync(fd int, flags int) error { 171 | id, err := r.PrepareFsync(fd, flags) 172 | if err != nil { 173 | return err 174 | } 175 | errno, _ := r.complete(id) 176 | if errno < 0 { 177 | return syscall.Errno(-errno) 178 | } 179 | return nil 180 | } 181 | 182 | // PrepareNop is used to prep a nop. 183 | func (r *Ring) PrepareNop() (uint64, error) { 184 | sqe, ready := r.SubmitEntry() 185 | if sqe == nil { 186 | return 0, errRingUnavailable 187 | } 188 | sqe.Opcode = Nop 189 | sqe.UserData = r.ID() 190 | sqe.Fd = -1 191 | 192 | ready() 193 | return sqe.UserData, nil 194 | } 195 | 196 | // Nop is a nop. 197 | func (r *Ring) Nop() error { 198 | id, err := r.PrepareNop() 199 | if err != nil { 200 | return err 201 | } 202 | errno, _ := r.complete(id) 203 | if errno < 0 { 204 | return syscall.Errno(-errno) 205 | } 206 | return nil 207 | } 208 | 209 | // PollAdd is used to add a poll to a fd. 210 | func (r *Ring) PollAdd(fd int, mask int) error { 211 | id, err := r.PreparePollAdd(fd, mask) 212 | if err != nil { 213 | return err 214 | } 215 | errno, _ := r.complete(id) 216 | if errno < 0 { 217 | return syscall.Errno(-errno) 218 | } 219 | return nil 220 | } 221 | 222 | // PreparePollAdd is used to prepare a SQE for adding a poll. 
223 | func (r *Ring) PreparePollAdd(fd int, mask int) (uint64, error) { 224 | sqe, ready := r.SubmitEntry() 225 | if sqe == nil { 226 | return 0, errRingUnavailable 227 | } 228 | sqe.Opcode = PollAdd 229 | sqe.Fd = int32(fd) 230 | sqe.UFlags = int32(mask) 231 | sqe.UserData = r.ID() 232 | 233 | ready() 234 | return sqe.UserData, nil 235 | } 236 | 237 | // PrepareReadv is used to prepare a readv SQE. 238 | func (r *Ring) PrepareReadv( 239 | fd int, 240 | iovecs []*syscall.Iovec, 241 | offset int, 242 | ) (uint64, error) { 243 | sqe, ready := r.SubmitEntry() 244 | if sqe == nil { 245 | return 0, errRingUnavailable 246 | } 247 | 248 | sqe.Opcode = Readv 249 | sqe.UserData = r.ID() 250 | sqe.Fd = int32(fd) 251 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&iovecs[0]))) 252 | sqe.Len = uint32(len(iovecs)) 253 | sqe.Offset = uint64(offset) 254 | 255 | ready() 256 | return sqe.UserData, nil 257 | } 258 | 259 | // PrepareRecvmsg is used to prepare a recvmsg SQE. 260 | func (r *Ring) PrepareRecvmsg( 261 | fd int, 262 | msg *syscall.Msghdr, 263 | flags int, 264 | ) (uint64, error) { 265 | sqe, ready := r.SubmitEntry() 266 | if sqe == nil { 267 | return 0, errRingUnavailable 268 | } 269 | 270 | sqe.Opcode = RecvMsg 271 | sqe.UserData = r.ID() 272 | sqe.Fd = int32(fd) 273 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(msg))) 274 | sqe.Len = 1 275 | sqe.Offset = 0 276 | sqe.UFlags = int32(flags) 277 | 278 | ready() 279 | return sqe.UserData, nil 280 | } 281 | 282 | // Splice implements splice using a ring. 283 | func (r *Ring) Splice( 284 | inFd int, 285 | inOff *int64, 286 | outFd int, 287 | outOff *int64, 288 | n int, 289 | flags int, 290 | ) (int64, error) { 291 | id, err := r.PrepareSplice(inFd, inOff, outFd, outOff, n, flags) 292 | if err != nil { 293 | return 0, err 294 | } 295 | // TODO: replace complete with something more efficient. 
296 | errno, res := r.complete(id) 297 | if errno < 0 { 298 | return 0, syscall.Errno(-errno) 299 | } 300 | runtime.KeepAlive(inOff) 301 | runtime.KeepAlive(outOff) 302 | return int64(res), nil 303 | } 304 | 305 | // PrepareSplice is used to prepare a SQE for a splice(2). 306 | func (r *Ring) PrepareSplice( 307 | inFd int, 308 | inOff *int64, 309 | outFd int, 310 | outOff *int64, 311 | n int, 312 | flags int, 313 | ) (uint64, error) { 314 | sqe, ready := r.SubmitEntry() 315 | if sqe == nil { 316 | return 0, errRingUnavailable 317 | } 318 | 319 | sqe.Opcode = Splice 320 | sqe.Fd = int32(outFd) 321 | if inOff != nil { 322 | sqe.Addr = uint64(uintptr(unsafe.Pointer(&inOff))) 323 | } else { 324 | sqe.Addr = 0 325 | } 326 | sqe.Len = uint32(n) 327 | if outOff != nil { 328 | sqe.Offset = uint64(uintptr(unsafe.Pointer(&outOff))) 329 | } else { 330 | sqe.Offset = 0 331 | } 332 | sqe.UFlags = int32(flags) 333 | // BUG: need to convert the inFd to the union member of the SQE 334 | anon := [24]byte{} 335 | binary.LittleEndian.PutUint32(anon[4:], uint32(inFd)) 336 | sqe.Anon0 = anon 337 | sqe.UserData = r.ID() 338 | 339 | ready() 340 | return sqe.UserData, nil 341 | } 342 | 343 | // Statx implements statx using a ring. 344 | func (r *Ring) Statx( 345 | dirfd int, 346 | path string, 347 | flags int, 348 | mask int, 349 | statx *unix.Statx_t, 350 | ) (err error) { 351 | id, err := r.PrepareStatx(dirfd, path, flags, mask, statx) 352 | if err != nil { 353 | return err 354 | } 355 | errno, _ := r.complete(id) 356 | // No GC until the request is done. 357 | runtime.KeepAlive(statx) 358 | runtime.KeepAlive(dirfd) 359 | runtime.KeepAlive(path) 360 | runtime.KeepAlive(mask) 361 | runtime.KeepAlive(flags) 362 | if errno < 0 { 363 | return syscall.Errno(-errno) 364 | } 365 | return nil 366 | } 367 | 368 | // PrepareStatx is used to prepare a Statx call and will return the request id 369 | // (SQE UserData) of the SQE. 
After calling the returned callback function the 370 | // ring is safe to be entered. 371 | func (r *Ring) PrepareStatx( 372 | dirfd int, 373 | path string, 374 | flags int, 375 | mask int, 376 | statx *unix.Statx_t, 377 | ) (uint64, error) { 378 | sqe, ready := r.SubmitEntry() 379 | if sqe == nil { 380 | return 0, errRingUnavailable 381 | } 382 | 383 | sqe.Opcode = Statx 384 | sqe.Fd = int32(dirfd) 385 | if path != "" { 386 | // TODO: could probably avoid the conversion to []byte 387 | b := saferStringToBytes(&path) 388 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 389 | } 390 | sqe.Len = uint32(mask) 391 | sqe.Offset = (uint64)(uintptr(unsafe.Pointer(statx))) 392 | sqe.UFlags = int32(flags) 393 | sqe.UserData = r.ID() 394 | 395 | ready() 396 | return sqe.UserData, nil 397 | } 398 | 399 | // PrepareTimeout is used to prepare a timeout SQE. 400 | func (r *Ring) PrepareTimeout( 401 | ts *syscall.Timespec, count int, flags int) (uint64, error) { 402 | sqe, ready := r.SubmitEntry() 403 | if sqe == nil { 404 | return 0, errRingUnavailable 405 | } 406 | 407 | sqe.Opcode = Timeout 408 | sqe.UserData = r.ID() 409 | sqe.UFlags = int32(flags) 410 | sqe.Fd = -1 411 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(ts))) 412 | sqe.Len = 1 413 | sqe.Offset = uint64(count) 414 | 415 | ready() 416 | return sqe.UserData, nil 417 | } 418 | 419 | // PrepareTimeoutRemove is used to prepare a timeout removal. 420 | func (r *Ring) PrepareTimeoutRemove(data uint64, flags int) (uint64, error) { 421 | sqe, ready := r.SubmitEntry() 422 | if sqe == nil { 423 | return 0, errRingUnavailable 424 | } 425 | 426 | sqe.Opcode = TimeoutRemove 427 | sqe.UserData = r.ID() 428 | sqe.UFlags = int32(flags) 429 | sqe.Fd = -1 430 | sqe.Addr = data 431 | sqe.Len = 0 432 | sqe.Offset = 0 433 | 434 | ready() 435 | return sqe.UserData, nil 436 | } 437 | 438 | // PrepareRead is used to prepare a read SQE. 
439 | func (r *Ring) PrepareRead( 440 | fd int, 441 | b []byte, 442 | offset uint64, 443 | flags uint8, 444 | ) (uint64, error) { 445 | sqe, ready := r.SubmitEntry() 446 | if sqe == nil { 447 | return 0, errRingUnavailable 448 | } 449 | 450 | sqe.Opcode = Read 451 | sqe.UserData = r.ID() 452 | sqe.Fd = int32(fd) 453 | sqe.Len = uint32(len(b)) 454 | sqe.Flags = flags 455 | sqe.Offset = offset 456 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 457 | 458 | ready() 459 | return sqe.UserData, nil 460 | } 461 | 462 | // PrepareReadFixed is used to prepare a fixed read SQE. 463 | func (r *Ring) PrepareReadFixed( 464 | fd int, 465 | b []byte, 466 | flags uint8, 467 | ) (uint64, error) { 468 | sqe, ready := r.SubmitEntry() 469 | if sqe == nil { 470 | return 0, errRingUnavailable 471 | } 472 | 473 | sqe.Opcode = ReadFixed 474 | sqe.UserData = r.ID() 475 | sqe.Fd = int32(fd) 476 | sqe.Len = uint32(len(b)) 477 | sqe.Flags = flags 478 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 479 | 480 | ready() 481 | return sqe.UserData, nil 482 | } 483 | 484 | // PrepareWrite is used to prepare a Write SQE. 485 | func (r *Ring) PrepareWrite( 486 | fd int, 487 | b []byte, 488 | offset uint64, 489 | flags uint8, 490 | ) (uint64, error) { 491 | sqe, ready := r.SubmitEntry() 492 | if sqe == nil { 493 | return 0, errRingUnavailable 494 | } 495 | 496 | sqe.Opcode = Write 497 | sqe.UserData = r.ID() 498 | sqe.Fd = int32(fd) 499 | sqe.Len = uint32(len(b)) 500 | sqe.Flags = flags 501 | sqe.Offset = offset 502 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 503 | 504 | ready() 505 | return sqe.UserData, nil 506 | } 507 | 508 | // PrepareWriteFixed is used to prepare a fixed write SQE. 
509 | func (r *Ring) PrepareWriteFixed( 510 | fd int, 511 | b []byte, 512 | flags uint8, 513 | ) (uint64, error) { 514 | sqe, ready := r.SubmitEntry() 515 | if sqe == nil { 516 | return 0, errRingUnavailable 517 | } 518 | 519 | sqe.Opcode = WriteFixed 520 | sqe.UserData = r.ID() 521 | sqe.Fd = int32(fd) 522 | sqe.Len = uint32(len(b)) 523 | sqe.Flags = flags 524 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 525 | 526 | ready() 527 | return sqe.UserData, nil 528 | } 529 | 530 | // PrepareWritev is used to prepare a writev SQE. 531 | func (r *Ring) PrepareWritev( 532 | fd int, 533 | iovecs []*syscall.Iovec, 534 | offset int, 535 | ) (uint64, error) { 536 | sqe, ready := r.SubmitEntry() 537 | if sqe == nil { 538 | return 0, errRingUnavailable 539 | } 540 | 541 | sqe.Opcode = Writev 542 | sqe.UserData = r.ID() 543 | sqe.Fd = int32(fd) 544 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&iovecs[0]))) 545 | sqe.Len = uint32(len(iovecs)) 546 | sqe.Offset = uint64(offset) 547 | 548 | ready() 549 | return sqe.UserData, nil 550 | } 551 | 552 | // PrepareSend is used to prepare a Send SQE. 553 | func (r *Ring) PrepareSend( 554 | fd int, 555 | b []byte, 556 | flags uint8, 557 | ) (uint64, error) { 558 | sqe, ready := r.SubmitEntry() 559 | if sqe == nil { 560 | return 0, errRingUnavailable 561 | } 562 | 563 | sqe.Opcode = Send 564 | sqe.UserData = r.ID() 565 | sqe.Fd = int32(fd) 566 | sqe.Len = uint32(len(b)) 567 | sqe.Flags = flags 568 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 569 | 570 | ready() 571 | return sqe.UserData, nil 572 | } 573 | 574 | // Send is used to send data to a socket. 575 | func (r *Ring) Send( 576 | fd int, 577 | b []byte, 578 | flags uint8, 579 | ) error { 580 | id, err := r.PrepareSend(fd, b, flags) 581 | if err != nil { 582 | return err 583 | } 584 | errno, _ := r.complete(id) 585 | // No GC until the request is done. 
586 | runtime.KeepAlive(b) 587 | if errno < 0 { 588 | return syscall.Errno(-errno) 589 | } 590 | return nil 591 | } 592 | 593 | // PrepareRecv is used to prepare a Recv SQE. 594 | func (r *Ring) PrepareRecv( 595 | fd int, 596 | b []byte, 597 | flags uint8, 598 | ) (uint64, error) { 599 | sqe, ready := r.SubmitEntry() 600 | if sqe == nil { 601 | return 0, errRingUnavailable 602 | } 603 | 604 | sqe.Opcode = Recv 605 | sqe.UserData = r.ID() 606 | sqe.Fd = int32(fd) 607 | sqe.Len = uint32(len(b)) 608 | sqe.Flags = flags 609 | sqe.Addr = (uint64)(uintptr(unsafe.Pointer(&b[0]))) 610 | 611 | ready() 612 | return sqe.UserData, nil 613 | } 614 | 615 | // Recv is used to recv data on a socket. 616 | func (r *Ring) Recv( 617 | fd int, 618 | b []byte, 619 | flags uint8, 620 | ) error { 621 | id, err := r.PrepareRecv(fd, b, flags) 622 | if err != nil { 623 | return err 624 | } 625 | errno, _ := r.complete(id) 626 | // No GC until the request is done. 627 | runtime.KeepAlive(b) 628 | if errno < 0 { 629 | return syscall.Errno(-errno) 630 | } 631 | return nil 632 | } 633 | --------------------------------------------------------------------------------