├── README.md ├── Makefile ├── LICENSE ├── interface_nolinux.go ├── mount_nolinux.go ├── qp_test.go ├── mr_test.go ├── dial_linux.go ├── mount_linux.go ├── syscall_linux_amd64.go ├── syscall_linux_386.go ├── listen_linux.go ├── interface_test.go ├── signal_test.go ├── syscall_linux.go ├── util.go ├── bench_test.go ├── read_test.go ├── mr_linux.go ├── dial_test.go ├── interface_linux.go ├── conn_test.go ├── conn_linux.go └── qp_linux.go /README.md: -------------------------------------------------------------------------------- 1 | # ib 2 | 3 | The ib package contains library functions to perform InfiniBand communication in Go on Linux. 4 | 5 | [Godocs](https://godoc.org/github.com/jsgilmore/ib) 6 | 7 | Dependencies: 8 | github.com/jsgilmore/mount 9 | github.com/jsgilmore/shm 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | go install . 3 | 4 | regenerate: 5 | set -xe 6 | GOARCH=386 go tool cgo -godefs syscall_linux.go |gofmt -s >syscall_linux_386.go 7 | GOARCH=amd64 go tool cgo -godefs syscall_linux.go |gofmt -s >syscall_linux_amd64.go 8 | go install . 9 | 10 | .DEFAULT_GOAL:=install -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2014 Vastech SA (PTY) LTD 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /interface_nolinux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build !linux 16 | 17 | package ib 18 | 19 | func Initialize() { 20 | panic("only implemented on linux") 21 | } 22 | -------------------------------------------------------------------------------- /mount_nolinux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // +build !linux 16 | 17 | package ib 18 | 19 | func SetupOptional() bool { 20 | panic("only implemented onl linux") 21 | } 22 | -------------------------------------------------------------------------------- /qp_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ib 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | ) 21 | 22 | func TestNewQPClose(t *testing.T) { 23 | for _, iface := range guidToInterface { 24 | qp, err := iface.NewQueuePair(10) 25 | if err != nil { 26 | panic(err) 27 | } 28 | fmt.Printf("%+v\n", qp.Query()) 29 | // this failed when NewQueuePair didn't Reset and Init the QP 30 | if err := qp.Close(); err != nil { 31 | panic(err) 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /mr_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ib 16 | 17 | import ( 18 | "github.com/jsgilmore/shm" 19 | "testing" 20 | ) 21 | 22 | func TestShmMemoryRegion(t *testing.T) { 23 | if !SetupOptional() { 24 | t.Skip("skipping TestShmMemoryRegion") 25 | return 26 | } 27 | buf, err := shm.NewBufferTmpfs(64<<20, shm.PROT_RDWR) 28 | if err != nil { 29 | panic(err) 30 | } 31 | mr, err := RegisterMemory(buf.Bytes()) 32 | if err != nil { 33 | panic(err) 34 | } 35 | checkClose(mr) 36 | checkClose(buf) 37 | 38 | buf, err = shm.NewBufferHugepages(64<<20, shm.PROT_RDWR) 39 | if err != nil { 40 | panic(err) 41 | } 42 | mr, err = RegisterMemory(buf.Bytes()) 43 | if err != nil { 44 | panic(err) 45 | } 46 | checkClose(mr) 47 | checkClose(buf) 48 | } 49 | -------------------------------------------------------------------------------- /dial_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | import ( 20 | "errors" 21 | "fmt" 22 | "net" 23 | ) 24 | 25 | var invalidAddrError = errors.New("invalid address") 26 | 27 | func DialRC(laddr, raddr *net.TCPAddr) (*RCConn, error) { 28 | if laddr == nil || raddr == nil { 29 | return nil, invalidAddrError 30 | } 31 | iface := InterfaceForAddr(&net.IPNet{IP: laddr.IP}) 32 | if iface == nil { 33 | err := errors.New(fmt.Sprintf("no interface for address %v", laddr)) 34 | return nil, err 35 | } 36 | // http://code.google.com/p/go/issues/detail?id=3097 37 | c, err := net.DialTCP("tcp", laddr, raddr) 38 | if err != nil { 39 | return nil, err 40 | } 41 | defer checkClose(c) 42 | return newRCConn(c, iface) 43 | } 44 | -------------------------------------------------------------------------------- /mount_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | import ( 20 | "github.com/jsgilmore/mount" 21 | "os" 22 | ) 23 | 24 | const mountDir = "/run/next/test/mnt" 25 | 26 | var setupDone = false 27 | 28 | func SetupOptional() bool { 29 | if setupDone { 30 | return true 31 | } 32 | if !mount.InMountNamespace() { 33 | return false 34 | } 35 | mount.MountNamespace() 36 | setup() 37 | setupDone = true 38 | return true 39 | } 40 | 41 | func setup() { 42 | // zero-sized directory just for mounts 43 | if err := os.MkdirAll(mountDir, 0777); err != nil { 44 | panic(err) 45 | } 46 | if err := mount.MountTmpfs(mountDir, 0); err != nil { 47 | panic(err) 48 | } 49 | 50 | if err := mount.MountTmpfs("/dev/shm", 14<<30); err != nil { 51 | panic(err) 52 | } 53 | if err := mount.MountHugetlbfs("/dev/hugepages", 2<<20, 4<<30); err != nil { 54 | panic(err) 55 | } 56 | setupDone = true 57 | } 58 | -------------------------------------------------------------------------------- /syscall_linux_amd64.go: -------------------------------------------------------------------------------- 1 | // Created by cgo -godefs - DO NOT EDIT 2 | // cgo -godefs syscall_linux.go 3 | 4 | package ib 5 | 6 | import ( 7 | "syscall" 8 | "unsafe" 9 | ) 10 | 11 | const ( 12 | POLLIN = 0x1 13 | POLLPRI = 0x2 14 | POLLOUT = 0x4 15 | POLLRDHUP = 0x2000 16 | POLLERR = 0x8 17 | POLLHUP = 0x10 18 | POLLNVAL = 0x20 19 | ) 20 | 21 | const ( 22 | RLIM_INFINITY = -0x1 23 | RLIMIT_AS = 0x9 24 | RLIMIT_CORE = 0x4 25 | RLIMIT_CPU = 0x0 26 | RLIMIT_DATA = 0x2 27 | RLIMIT_FSIZE = 0x1 28 | RLIMIT_MEMLOCK = 0x8 29 | RLIMIT_MSGQUEUE = 0xc 30 | RLIMIT_NICE = 0xd 31 | RLIMIT_NOFILE = 0x7 32 | RLIMIT_NPROC = 0x6 33 | RLIMIT_RSS = 0x5 34 | RLIMIT_RTPRIO = 0xe 35 | RLIMIT_SIGPENDING = 0xb 36 | RLIMIT_STACK = 0x3 37 | ) 38 | 39 | type Pollfd struct { 40 | Fd int32 41 | Events int16 42 | Revents int16 43 | } 44 | type Rlimit struct { 45 | Cur uint64 46 | Max uint64 47 | } 48 | 49 | func Getrlimit(resource int) (rlim Rlimit, err error) 
{ 50 | _, _, e1 := syscall.Syscall(syscall.SYS_GETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(&rlim)), 0) 51 | if e1 != 0 { 52 | err = e1 53 | } 54 | return 55 | } 56 | 57 | func Poll(fds []Pollfd, timeout int64) (n int, err error) { 58 | for { 59 | r0, _, e1 := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&fds[0])), uintptr(len(fds)), uintptr(timeout)) 60 | n = int(r0) 61 | switch e1 { 62 | case 0: 63 | return 64 | case 0x4: 65 | 66 | default: 67 | err = e1 68 | return 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /syscall_linux_386.go: -------------------------------------------------------------------------------- 1 | // Created by cgo -godefs - DO NOT EDIT 2 | // cgo -godefs syscall_linux.go 3 | 4 | package ib 5 | 6 | import ( 7 | "syscall" 8 | "unsafe" 9 | ) 10 | 11 | const ( 12 | POLLIN = 0x1 13 | POLLPRI = 0x2 14 | POLLOUT = 0x4 15 | POLLRDHUP = 0x2000 16 | POLLERR = 0x8 17 | POLLHUP = 0x10 18 | POLLNVAL = 0x20 19 | ) 20 | 21 | const ( 22 | RLIM_INFINITY = 0xffffffff 23 | RLIMIT_AS = 0x9 24 | RLIMIT_CORE = 0x4 25 | RLIMIT_CPU = 0x0 26 | RLIMIT_DATA = 0x2 27 | RLIMIT_FSIZE = 0x1 28 | RLIMIT_MEMLOCK = 0x8 29 | RLIMIT_MSGQUEUE = 0xc 30 | RLIMIT_NICE = 0xd 31 | RLIMIT_NOFILE = 0x7 32 | RLIMIT_NPROC = 0x6 33 | RLIMIT_RSS = 0x5 34 | RLIMIT_RTPRIO = 0xe 35 | RLIMIT_SIGPENDING = 0xb 36 | RLIMIT_STACK = 0x3 37 | ) 38 | 39 | type Pollfd struct { 40 | Fd int32 41 | Events int16 42 | Revents int16 43 | } 44 | type Rlimit struct { 45 | Cur uint32 46 | Max uint32 47 | } 48 | 49 | func Getrlimit(resource int) (rlim Rlimit, err error) { 50 | _, _, e1 := syscall.Syscall(syscall.SYS_GETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(&rlim)), 0) 51 | if e1 != 0 { 52 | err = e1 53 | } 54 | return 55 | } 56 | 57 | func Poll(fds []Pollfd, timeout int64) (n int, err error) { 58 | for { 59 | r0, _, e1 := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&fds[0])), uintptr(len(fds)), 
uintptr(timeout)) 60 | n = int(r0) 61 | switch e1 { 62 | case 0: 63 | return 64 | case 0x4: 65 | 66 | default: 67 | err = e1 68 | return 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /listen_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | import ( 20 | "errors" 21 | "net" 22 | ) 23 | 24 | type RCListener struct { 25 | l *net.TCPListener 26 | iface *Interface 27 | } 28 | 29 | var errNoInterfaceForAddr = errors.New("ib: ListenRC: no interface for address") 30 | 31 | func ListenRC(laddr *net.TCPAddr) (*RCListener, error) { 32 | if laddr == nil { 33 | return nil, invalidAddrError 34 | } 35 | iface := InterfaceForAddr(&net.IPNet{IP: laddr.IP}) 36 | if iface == nil { 37 | return nil, errNoInterfaceForAddr 38 | } 39 | l, err := net.ListenTCP("tcp", laddr) 40 | if err != nil { 41 | return nil, err 42 | } 43 | return &RCListener{l, iface}, nil 44 | } 45 | 46 | func (this *RCListener) Accept() (*RCConn, error) { 47 | c, err := this.l.AcceptTCP() 48 | if err != nil { 49 | return nil, err 50 | } 51 | defer checkClose(c) 52 | return newRCConn(c, this.iface) 53 | } 54 | 55 | func (this *RCListener) Addr() net.Addr { 56 | return this.l.Addr() 57 | } 58 | 59 | func (this *RCListener) 
Close() error { 60 | return this.l.Close() 61 | } 62 | -------------------------------------------------------------------------------- /interface_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ib 16 | 17 | import ( 18 | "fmt" 19 | "sync" 20 | "testing" 21 | ) 22 | 23 | func TestInterfaces(t *testing.T) { 24 | for _, iface := range interfaces { 25 | fmt.Printf("%+v active=%v lid=%d\n", iface, iface.Active(), iface.Lid()) 26 | } 27 | } 28 | 29 | func TestInterfaceAddrs(t *testing.T) { 30 | addrs := InterfaceAddrs() 31 | for _, addr := range addrs { 32 | iface := InterfaceForAddr(addr) 33 | if iface == nil { 34 | continue 35 | } 36 | fmt.Printf("%v -> %+v guid=%x\n", addr, iface, []byte(iface.guid)) 37 | } 38 | } 39 | 40 | func TestCompletionChannels(t *testing.T) { 41 | var wg sync.WaitGroup 42 | ibAddrs := InterfaceAddrs() 43 | for _, addr := range ibAddrs { 44 | iface := InterfaceForAddr(addr) 45 | if iface == nil { 46 | continue 47 | } 48 | wg.Add(1) 49 | go func() { 50 | defer wg.Done() 51 | for i := 0; i < 10; i++ { 52 | cq := iface.createCompletionQueue(5) 53 | if cq == nil { 54 | panic("ibv_create_cq: failure") 55 | } 56 | fmt.Printf("cq.channel=%+v\n", cq.channel) 57 | if err := destroyCompletionQueue(cq); err != nil { 58 | panic(err) 59 | } 60 | } 61 | }() 62 | 
} 63 | wg.Wait() 64 | } 65 | 66 | func init() { 67 | Initialize() 68 | } 69 | -------------------------------------------------------------------------------- /signal_test.go: -------------------------------------------------------------------------------- 1 | package ib 2 | 3 | import ( 4 | "net" 5 | "sync" 6 | "syscall" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | // Test if random signals cause infiniband errors. Poll did not check for EINTR and could be interrupted causing errors 12 | // to be returned. Needs -cpu=2 or higher to trigger the issue. 13 | func TestConnWithSignals(t *testing.T) { 14 | for i := 0; i < 10; i++ { 15 | testConnWithSignals(t) 16 | } 17 | } 18 | 19 | func testConnWithSignals(t *testing.T) { 20 | stop := make(chan struct{}) 21 | var wg sync.WaitGroup 22 | wg.Add(1) 23 | go signalSelf(stop, &wg) 24 | connectReadAndWrite(t) 25 | close(stop) 26 | wg.Wait() 27 | } 28 | 29 | func signalSelf(stop chan struct{}, wg *sync.WaitGroup) { 30 | ticker := time.NewTicker(10 * time.Millisecond) 31 | for { 32 | select { 33 | case <-ticker.C: 34 | syscall.Kill(syscall.Getpid(), syscall.SIGUSR1) 35 | case <-stop: 36 | wg.Done() 37 | return 38 | } 39 | } 40 | } 41 | 42 | func connectReadAndWrite(t *testing.T) { 43 | laddr, raddr := chooseInterfaces(t) 44 | sendMr, err := AllocateMemory(benchMrLen) 45 | if err != nil { 46 | panic(err) 47 | } 48 | populateRegion(sendMr) 49 | 50 | recvMr, err := AllocateMemory(sendMr.Len()) 51 | if err != nil { 52 | panic(err) 53 | } 54 | 55 | l, err := ListenRC(raddr) 56 | if err != nil { 57 | panic(err) 58 | } 59 | raddr = l.Addr().(*net.TCPAddr) 60 | 61 | ch := make(chan *RCConn) 62 | go func() { 63 | c, err := DialRC(laddr, raddr) 64 | if err != nil { 65 | panic(err) 66 | } 67 | ch <- c 68 | }() 69 | srvc, err := l.Accept() 70 | if err != nil { 71 | panic(err) 72 | } 73 | clic := <-ch 74 | 75 | var wg sync.WaitGroup 76 | wg.Add(1) 77 | go benchReader(&wg, srvc, recvMr, 1) 78 | wg.Add(1) 79 | go benchWriter(&wg, clic, sendMr, 1) 
80 | wg.Wait() 81 | 82 | if err := l.Close(); err != nil { 83 | panic(err) 84 | } 85 | if err := recvMr.Close(); err != nil { 86 | panic(err) 87 | } 88 | if err := sendMr.Close(); err != nil { 89 | panic(err) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /syscall_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // +build ignore 16 | 17 | package ib 18 | 19 | //#define _GNU_SOURCE 20 | //#include 21 | //#include 22 | //#include 23 | //#include 24 | import "C" 25 | 26 | import ( 27 | "syscall" 28 | "unsafe" 29 | ) 30 | 31 | const ( 32 | POLLIN = C.POLLIN 33 | POLLPRI = C.POLLPRI 34 | POLLOUT = C.POLLOUT 35 | POLLRDHUP = C.POLLRDHUP 36 | POLLERR = C.POLLERR 37 | POLLHUP = C.POLLHUP 38 | POLLNVAL = C.POLLNVAL 39 | ) 40 | 41 | const ( 42 | RLIM_INFINITY = C.RLIM_INFINITY 43 | RLIMIT_AS = C.RLIMIT_AS 44 | RLIMIT_CORE = C.RLIMIT_CORE 45 | RLIMIT_CPU = C.RLIMIT_CPU 46 | RLIMIT_DATA = C.RLIMIT_DATA 47 | RLIMIT_FSIZE = C.RLIMIT_FSIZE 48 | RLIMIT_MEMLOCK = C.RLIMIT_MEMLOCK 49 | RLIMIT_MSGQUEUE = C.RLIMIT_MSGQUEUE 50 | RLIMIT_NICE = C.RLIMIT_NICE 51 | RLIMIT_NOFILE = C.RLIMIT_NOFILE 52 | RLIMIT_NPROC = C.RLIMIT_NPROC 53 | RLIMIT_RSS = C.RLIMIT_RSS 54 | RLIMIT_RTPRIO = C.RLIMIT_RTPRIO 55 | RLIMIT_SIGPENDING = C.RLIMIT_SIGPENDING 56 | RLIMIT_STACK = C.RLIMIT_STACK 57 | ) 58 | 59 | type Pollfd C.struct_pollfd 60 | type Rlimit C.struct_rlimit 61 | 62 | func Getrlimit(resource int) (rlim Rlimit, err error) { 63 | _, _, e1 := syscall.Syscall(syscall.SYS_GETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(&rlim)), 0) 64 | if e1 != 0 { 65 | err = e1 66 | } 67 | return 68 | } 69 | 70 | func Poll(fds []Pollfd, timeout int64) (n int, err error) { 71 | for { 72 | r0, _, e1 := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&fds[0])), uintptr(len(fds)), uintptr(timeout)) 73 | n = int(r0) 74 | switch e1 { 75 | case 0: 76 | return 77 | case C.EINTR: 78 | // Retry system call if it returns EINTR 79 | default: 80 | err = e1 81 | return 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file 
except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ib 16 | 17 | import "C" 18 | 19 | import ( 20 | "encoding/binary" 21 | "errors" 22 | "io" 23 | "os" 24 | "syscall" 25 | ) 26 | 27 | func newError(name string, errno C.int) error { 28 | if errno > 0 { 29 | return os.NewSyscallError(name, syscall.Errno(errno)) 30 | } 31 | if errno < 0 { 32 | // generic error for functions that don't set errno 33 | return errors.New(name + ": failure") 34 | } 35 | return nil 36 | } 37 | 38 | func readQPParams(r io.Reader) (destLid uint16, destQpn, destPsn uint32, err error) { 39 | err = binary.Read(r, binary.LittleEndian, &destLid) 40 | if err != nil { 41 | return 42 | } 43 | err = binary.Read(r, binary.LittleEndian, &destQpn) 44 | if err != nil { 45 | return 46 | } 47 | err = binary.Read(r, binary.LittleEndian, &destPsn) 48 | return 49 | } 50 | 51 | func writeQPParams(w io.Writer, lid uint16, qpn, psn uint32) (err error) { 52 | err = binary.Write(w, binary.LittleEndian, &lid) 53 | if err != nil { 54 | return 55 | } 56 | err = binary.Write(w, binary.LittleEndian, &qpn) 57 | if err != nil { 58 | return 59 | } 60 | err = binary.Write(w, binary.LittleEndian, &psn) 61 | return 62 | } 63 | 64 | func readWriteQPParams(rw io.ReadWriter, lid uint16, qpn, psn uint32) (destLid uint16, destQpn, destPsn uint32, err error) { 65 | destLid, destQpn, destPsn, err = readQPParams(rw) 66 | if err != nil { 67 | return 68 | } 69 | err = writeQPParams(rw, lid, qpn, psn) 70 | return 71 | } 72 | 73 | func writeReadQPParams(rw io.ReadWriter, lid uint16, 
qpn, psn uint32) (destLid uint16, destQpn, destPsn uint32, err error) { 74 | err = writeQPParams(rw, lid, qpn, psn) 75 | if err != nil { 76 | return 77 | } 78 | destLid, destQpn, destPsn, err = readQPParams(rw) 79 | return 80 | } 81 | 82 | type closer interface { 83 | Close() error 84 | } 85 | 86 | func checkClose(c closer) { 87 | if err := c.Close(); err != nil { 88 | panic(err) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ib 16 | 17 | import ( 18 | "net" 19 | "sync" 20 | "testing" 21 | ) 22 | 23 | func benchReader(wg *sync.WaitGroup, c *RCConn, mr *MemoryRegion, n int) { 24 | defer wg.Done() 25 | c.SetTimeout(1e9) 26 | for i := 0; i < n; i++ { 27 | if err := c.Read(mr); err != nil { 28 | panic(err) 29 | } 30 | } 31 | if err := c.Close(); err != nil { 32 | panic(err) 33 | } 34 | } 35 | 36 | func benchWriter(wg *sync.WaitGroup, c *RCConn, mr *MemoryRegion, n int) { 37 | defer wg.Done() 38 | c.SetTimeout(1e9) 39 | for i := 0; i < n; i++ { 40 | if err := c.Write(mr); err != nil { 41 | panic(err) 42 | } 43 | } 44 | if err := c.Close(); err != nil { 45 | panic(err) 46 | } 47 | } 48 | 49 | const benchMult = 100 50 | const benchMrLen = 64 * 1024 * 1024 51 | 52 | func BenchmarkThroughput(b *testing.B) { 53 | laddr, raddr := chooseInterfaces(b) 54 | sendMr, err := AllocateMemory(benchMrLen) 55 | if err != nil { 56 | panic(err) 57 | } 58 | b.SetBytes(benchMult * int64(len(sendMr.Bytes()))) 59 | populateRegion(sendMr) 60 | 61 | recvMr, err := AllocateMemory(sendMr.Len()) 62 | if err != nil { 63 | panic(err) 64 | } 65 | 66 | l, err := ListenRC(raddr) 67 | if err != nil { 68 | panic(err) 69 | } 70 | raddr = l.Addr().(*net.TCPAddr) 71 | 72 | ch := make(chan *RCConn) 73 | go func() { 74 | c, err := DialRC(laddr, raddr) 75 | if err != nil { 76 | panic(err) 77 | } 78 | ch <- c 79 | }() 80 | srvc, err := l.Accept() 81 | if err != nil { 82 | panic(err) 83 | } 84 | clic := <-ch 85 | 86 | n := benchMult * b.N 87 | 88 | b.ResetTimer() 89 | b.StartTimer() 90 | 91 | var wg sync.WaitGroup 92 | wg.Add(1) 93 | go benchReader(&wg, srvc, recvMr, n) 94 | wg.Add(1) 95 | go benchWriter(&wg, clic, sendMr, n) 96 | wg.Wait() 97 | 98 | b.StopTimer() 99 | 100 | if err := l.Close(); err != nil { 101 | panic(err) 102 | } 103 | if err := recvMr.Close(); err != nil { 104 | panic(err) 105 | } 106 | if err := sendMr.Close(); err != nil { 107 | panic(err) 108 | } 109 | } 110 | 
-------------------------------------------------------------------------------- /read_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ib 16 | 17 | import ( 18 | "fmt" 19 | "net" 20 | "testing" 21 | ) 22 | 23 | func chooseInterfaces(tb testing.TB) (*net.TCPAddr, *net.TCPAddr) { 24 | ibAddrs := InterfaceAddrs() 25 | var addri, addrj net.Addr 26 | for i := 0; i < len(ibAddrs); i++ { 27 | if iface := InterfaceForAddr(ibAddrs[i]); iface == nil || !iface.Active() { 28 | continue 29 | } 30 | addri = ibAddrs[i] 31 | } 32 | for j := len(ibAddrs) - 1; j >= 0; j-- { 33 | if iface := InterfaceForAddr(ibAddrs[j]); iface == nil || !iface.Active() { 34 | continue 35 | } 36 | addrj = ibAddrs[j] 37 | } 38 | if addri == nil || addrj == nil { 39 | tb.Skip("no interfaces to test with") 40 | } 41 | laddr := &net.TCPAddr{IP: addri.(*net.IPNet).IP} 42 | raddr := &net.TCPAddr{IP: addrj.(*net.IPNet).IP} 43 | return laddr, raddr 44 | } 45 | 46 | func connPair(laddr, raddr *net.TCPAddr) (*RCConn, *RCConn) { 47 | l, err := ListenRC(raddr) 48 | if err != nil { 49 | panic(err) 50 | } 51 | defer checkClose(l) 52 | // conn channel 53 | cc := make(chan *RCConn) 54 | go func() { 55 | c, err := l.Accept() 56 | if err != nil { 57 | panic(err) 58 | } 59 | cc <- c 60 | }() 61 | c, err := DialRC(laddr, 
l.Addr().(*net.TCPAddr)) 62 | if err != nil { 63 | panic(err) 64 | } 65 | return c, <-cc 66 | } 67 | 68 | func TestWriteAfterClose(t *testing.T) { 69 | laddr, raddr := chooseInterfaces(t) 70 | c1, c2 := connPair(laddr, raddr) 71 | mr, err := AllocateMemory(8192) 72 | if err != nil { 73 | panic(err) 74 | } 75 | if err := c2.Close(); err != nil { 76 | panic(err) 77 | } 78 | c1.SetTimeout(10e9) 79 | fmt.Printf("writing\n") 80 | err = c1.Write(mr) 81 | fmt.Printf("write returned\n") 82 | if err == nil { 83 | panic("expected a write error") 84 | } 85 | fmt.Printf("write error: %+v\n", err) 86 | if err := c1.Close(); err != nil { 87 | panic(err) 88 | } 89 | if err := mr.Close(); err != nil { 90 | panic(err) 91 | } 92 | } 93 | 94 | func TestReadAfterClose(t *testing.T) { 95 | laddr, raddr := chooseInterfaces(t) 96 | c1, c2 := connPair(laddr, raddr) 97 | mr, err := AllocateMemory(8192) 98 | if err != nil { 99 | panic(err) 100 | } 101 | if err := c2.Close(); err != nil { 102 | panic(err) 103 | } 104 | c1.SetTimeout(10e9) 105 | for i := 0; i < 100; i++ { 106 | fmt.Printf("reading\n") 107 | err = c1.Read(mr) 108 | fmt.Printf("read returned\n") 109 | if err == nil { 110 | panic("expected a read error") 111 | } 112 | if t, ok := err.(timeout); ok && t.Timeout() { 113 | err = nil 114 | continue 115 | } else { 116 | break 117 | } 118 | } 119 | if err == nil { 120 | panic("expected a non-timeout read error") 121 | } 122 | fmt.Printf("read error: %+v\n", err) 123 | if err := c1.Close(); err != nil { 124 | panic(err) 125 | } 126 | if err := mr.Close(); err != nil { 127 | panic(err) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /mr_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | //#include 20 | import "C" 21 | 22 | import ( 23 | "errors" 24 | "fmt" 25 | "os" 26 | "runtime" 27 | "syscall" 28 | "unsafe" 29 | ) 30 | 31 | const ( 32 | IBV_ACCESS_LOCAL_WRITE = C.IBV_ACCESS_LOCAL_WRITE 33 | IBV_ACCESS_REMOTE_WRITE = C.IBV_ACCESS_REMOTE_WRITE 34 | ) 35 | 36 | type mrMap map[*C.struct_ibv_pd]*C.struct_ibv_mr 37 | 38 | type MemoryRegion struct { 39 | mrs mrMap 40 | buf []byte 41 | unmap bool 42 | } 43 | 44 | func (this *MemoryRegion) String() string { 45 | if this.buf == nil { 46 | return "MemoryRegion@closed" 47 | } 48 | return fmt.Sprintf("MemoryRegion@%x[%d]", &this.buf[0], len(this.buf)) 49 | } 50 | 51 | func AllocateMemory(size int) (*MemoryRegion, error) { 52 | const mrProt = syscall.PROT_READ | syscall.PROT_WRITE 53 | const mrFlags = syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS 54 | buf, err := syscall.Mmap(-1, 0, size, mrProt, mrFlags) 55 | if err != nil { 56 | return nil, os.NewSyscallError("mmap", err) 57 | } 58 | return register(buf, true) 59 | } 60 | 61 | func RegisterMemory(buf []byte) (*MemoryRegion, error) { 62 | return register(buf, false) 63 | } 64 | 65 | var errRegisterInvalidBuf = errors.New("ib: register: invalid buffer") 66 | 67 | func register(buf []byte, unmap bool) (*MemoryRegion, error) { 68 | if len(buf) == 0 || len(buf) != cap(buf) { 69 | return nil, errRegisterInvalidBuf 70 | } 71 | const writeAccess = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE 72 | mrs := make(mrMap, len(pds)) 73 | for pd := range pds { 74 | // 
there seems to be some kind of limit at 32 GB where 75 | // ibv_reg_mr hits an internal ENOMEM 76 | cmr := C.ibv_reg_mr(pd, unsafe.Pointer(&buf[0]), C.size_t(len(buf)), writeAccess) 77 | if cmr == nil { 78 | return nil, newError("ibv_reg_mr", -1) 79 | } 80 | mrs[pd] = cmr 81 | } 82 | mr := &MemoryRegion{mrs, buf, unmap} 83 | runtime.SetFinalizer(mr, (*MemoryRegion).finalize) 84 | return mr, nil 85 | } 86 | 87 | func (mr *MemoryRegion) finalize() { 88 | panic("finalized unclosed memory region") 89 | } 90 | 91 | func (mr *MemoryRegion) Bytes() []byte { 92 | return mr.buf 93 | } 94 | 95 | func (mr *MemoryRegion) Ptr() unsafe.Pointer { 96 | return unsafe.Pointer(&mr.buf[0]) 97 | } 98 | 99 | func (mr *MemoryRegion) Len() int { 100 | return len(mr.buf) 101 | } 102 | 103 | func (mr *MemoryRegion) RemoteKey(pd *C.struct_ibv_pd) uint32 { 104 | ibvMr := mr.mrs[pd] 105 | if ibvMr == nil { 106 | return 0 107 | } 108 | return uint32(ibvMr.rkey) 109 | } 110 | 111 | func (mr *MemoryRegion) Close() error { 112 | for pd, cmr := range mr.mrs { 113 | errno := C.ibv_dereg_mr(cmr) 114 | if errno != 0 { 115 | panic(newError("ibv_dereg_mr", errno)) 116 | } 117 | delete(mr.mrs, pd) 118 | } 119 | mr.mrs = nil 120 | 121 | if mr.unmap && mr.buf != nil { 122 | err := syscall.Munmap(mr.buf) 123 | if err != nil { 124 | return os.NewSyscallError("munmap", err) 125 | } 126 | mr.buf = nil 127 | } 128 | 129 | runtime.SetFinalizer(mr, nil) 130 | 131 | return nil 132 | } 133 | 134 | func (mr *MemoryRegion) populateSge(pd *C.struct_ibv_pd, sge *C.struct_ibv_sge) { 135 | ibvMr := mr.mrs[pd] 136 | if ibvMr == nil { 137 | panic("invalid memory region") 138 | } 139 | sge.addr = C.uint64_t(uintptr(ibvMr.addr)) 140 | sge.length = C.uint32_t(ibvMr.length) 141 | sge.lkey = ibvMr.lkey 142 | } 143 | -------------------------------------------------------------------------------- /dial_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) 
LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ib 16 | 17 | import ( 18 | "fmt" 19 | "net" 20 | "sync" 21 | "testing" 22 | "time" 23 | ) 24 | 25 | func populateRegion(mr *MemoryRegion) { 26 | b := mr.Bytes() 27 | for i := 0; i < len(b); i++ { 28 | b[i] = byte(i) 29 | } 30 | } 31 | 32 | func checkRegion(mr *MemoryRegion) bool { 33 | b := mr.Bytes() 34 | for i := 0; i < len(b); i++ { 35 | if b[i] != byte(i) { 36 | return false 37 | } 38 | } 39 | return true 40 | } 41 | 42 | func rwMrLen() int { 43 | if testing.Short() { 44 | return 1024 * 1024 45 | } 46 | return 64 * 1024 * 1024 47 | } 48 | 49 | func reader(c *RCConn, n int) { 50 | t := int64(200e6) 51 | if testing.Short() { 52 | t = 10e6 53 | } 54 | c.SetTimeout(t) 55 | 56 | for i := 0; i < n; i++ { 57 | mr, err := AllocateMemory(rwMrLen()) 58 | if err != nil { 59 | panic(err) 60 | } 61 | 62 | // Note for test parameter tweakers: if number of loop 63 | // iterations is small enough and there is a short timeout, 64 | // this test can fail to read data into the memory region. 
65 | readDone := false 66 | const ntimeouts = 10 67 | for j := 0; j < ntimeouts; j++ { 68 | if err := c.Read(mr); err == nil { 69 | fmt.Printf("Read into %v\n", mr) 70 | readDone = true 71 | break 72 | } else { 73 | if t, ok := err.(timeout); ok && t.Timeout() { 74 | fmt.Printf("read timed out, looping\n") 75 | continue 76 | } 77 | panic("reader read failed: " + err.Error()) 78 | } 79 | } 80 | 81 | if !readDone { 82 | panic(fmt.Errorf("read not done with %d timeouts of %d ns", ntimeouts, t)) 83 | } 84 | 85 | if !checkRegion(mr) { 86 | panic("memory region data invalid") 87 | } 88 | 89 | if err := mr.Close(); err != nil { 90 | panic(err) 91 | } 92 | } 93 | 94 | fmt.Printf("reader closing\n") 95 | if err := c.Close(); err != nil { 96 | panic(err) 97 | } 98 | fmt.Printf("reader closed\n") 99 | } 100 | 101 | func writer(c *RCConn, n int) { 102 | c.SetTimeout(5e9) 103 | 104 | for i := 0; i < n; i++ { 105 | mr, err := AllocateMemory(rwMrLen()) 106 | if err != nil { 107 | panic(err) 108 | } 109 | populateRegion(mr) 110 | // sleep a bit so that read will send keepalives 111 | if testing.Short() { 112 | time.Sleep(30e6) 113 | } else { 114 | time.Sleep(600e6) 115 | } 116 | if err := c.Write(mr); err != nil { 117 | panic("writer write failed: " + err.Error()) 118 | } 119 | fmt.Printf("Wrote from %v\n", mr) 120 | if err := mr.Close(); err != nil { 121 | panic(err) 122 | } 123 | } 124 | 125 | fmt.Printf("writer closing\n") 126 | if err := c.Close(); err != nil { 127 | panic(err) 128 | } 129 | fmt.Printf("writer closed\n") 130 | } 131 | 132 | // nops must be bigger than 16 to expose the ENOMEM error from 133 | // ibv_post_send with unsignaled inline sends 134 | const nops = 20 135 | 136 | func client(wg *sync.WaitGroup, laddr, raddr *net.TCPAddr) { 137 | defer wg.Done() 138 | c, err := DialRC(laddr, raddr) 139 | if err != nil { 140 | panic(err) 141 | } 142 | reader(c, nops) 143 | } 144 | 145 | func server(wg *sync.WaitGroup, l *RCListener) { 146 | defer wg.Done() 147 | c, err := 
l.Accept() 148 | if err != nil { 149 | panic(err) 150 | } 151 | writer(c, nops) 152 | } 153 | 154 | // This test takes longer to tear down if the reader closes before the 155 | // writer. Don't remember why exactly, but this is the expected 156 | // behaviour due to the way keepalives and RCConn.Close works. 157 | func TestListenDial(t *testing.T) { 158 | ibAddrs := InterfaceAddrs() 159 | var wg sync.WaitGroup 160 | for i, addri := range ibAddrs { 161 | if iface := InterfaceForAddr(addri); iface == nil || !iface.Active() { 162 | fmt.Printf("Skipping %v\n", addri) 163 | continue 164 | } 165 | laddr := &net.TCPAddr{IP: addri.(*net.IPNet).IP} 166 | for _, addrj := range ibAddrs[i:] { 167 | if iface := InterfaceForAddr(addrj); iface == nil || !iface.Active() { 168 | fmt.Printf("Skipping %v\n", addrj) 169 | continue 170 | } 171 | raddr := &net.TCPAddr{IP: addrj.(*net.IPNet).IP} 172 | l, err := ListenRC(raddr) 173 | if err != nil { 174 | panic(err) 175 | } 176 | // get port 177 | raddr = l.Addr().(*net.TCPAddr) 178 | fmt.Printf("laddr=%+v raddr=%v\n", laddr, raddr) 179 | wg.Add(1) 180 | go server(&wg, l) 181 | wg.Add(1) 182 | go client(&wg, laddr, raddr) 183 | wg.Wait() 184 | } 185 | 186 | if testing.Short() { 187 | break 188 | } 189 | } 190 | } 191 | 192 | // run this test with GOMAXPROCS>1 for extra fun 193 | func xTestListenDialLoop(t *testing.T) { 194 | for k := 0; k < 10; k++ { 195 | var wg sync.WaitGroup 196 | for i := 0; i < 10; i++ { 197 | wg.Add(1) 198 | go func() { 199 | defer wg.Done() 200 | TestListenDial(t) 201 | }() 202 | } 203 | wg.Wait() 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /interface_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | // workaround for http://code.google.com/p/go/issues/detail?id=3027 20 | //#define inline 21 | //#define static 22 | 23 | //#include 24 | //#cgo linux LDFLAGS: -libverbs 25 | import "C" 26 | 27 | import ( 28 | "fmt" 29 | "net" 30 | "os" 31 | "unsafe" 32 | ) 33 | 34 | type pdsSet map[*C.struct_ibv_pd]bool 35 | 36 | var interfaces []*Interface 37 | 38 | var guidToInterface = make(map[string]*Interface) 39 | 40 | var pds pdsSet 41 | 42 | type Interface struct { 43 | ctx *C.struct_ibv_context 44 | pd *C.struct_ibv_pd 45 | port uint8 46 | deviceAttr *C.struct_ibv_device_attr 47 | guid net.HardwareAddr 48 | } 49 | 50 | func (iface *Interface) Lid() uint16 { 51 | var portAttr C.struct_ibv_port_attr 52 | errno := C.ibv_query_port(iface.ctx, C.uint8_t(iface.port), &portAttr) 53 | if errno != 0 { 54 | return 0 55 | } 56 | return uint16(portAttr.lid) 57 | } 58 | 59 | func (iface *Interface) Active() bool { 60 | var portAttr C.struct_ibv_port_attr 61 | errno := C.ibv_query_port(iface.ctx, C.uint8_t(iface.port), &portAttr) 62 | if errno != 0 { 63 | return false 64 | } 65 | return portAttr.state == C.IBV_PORT_ACTIVE 66 | } 67 | 68 | func newInterfaces(ctx *C.struct_ibv_context) { 69 | var deviceAttr C.struct_ibv_device_attr 70 | errno := C.ibv_query_device(ctx, &deviceAttr) 71 | if errno != 0 { 72 | return 73 | } 74 | 75 | pd := C.ibv_alloc_pd(ctx) 76 | if pd == nil { 77 | panic(newError("ibv_alloc_pd", -1)) 78 | } 79 | pds[pd] = true 80 | 81 | for port := C.uint8_t(1); port 
<= deviceAttr.phys_port_cnt; port++ { 82 | var portAttr C.struct_ibv_port_attr 83 | errno := C.ibv_query_port(ctx, port, &portAttr) 84 | if errno != 0 { 85 | continue 86 | } 87 | 88 | var gid C.union_ibv_gid 89 | errno = C.ibv_query_gid(ctx, port, 0, &gid) 90 | if errno != 0 { 91 | continue 92 | } 93 | // last 8 bytes of GID is the GUID 94 | guid := net.HardwareAddr(gid[8:]) 95 | iface := &Interface{ctx, pd, uint8(port), &deviceAttr, guid} 96 | interfaces = append(interfaces, iface) 97 | guidToInterface[string([]byte(guid))] = iface 98 | } 99 | } 100 | 101 | func InterfaceForAddr(addr net.Addr) *Interface { 102 | var ip net.IP 103 | if ipAddr, ok := addr.(*net.IPNet); ok { 104 | ip = ipAddr.IP 105 | } else if tcpAddr, ok := addr.(*net.TCPAddr); ok { 106 | ip = tcpAddr.IP 107 | } else { 108 | return nil 109 | } 110 | 111 | ifaces, err := net.Interfaces() 112 | if err != nil { 113 | return nil 114 | } 115 | 116 | var ibIface *Interface 117 | for i := range ifaces { 118 | iface := &ifaces[i] 119 | 120 | // hack until ifi_type in ifinfomsg is available in Go 121 | if len(iface.HardwareAddr) != 20 { 122 | continue 123 | } 124 | 125 | guidKey := string([]byte(iface.HardwareAddr[12:])) 126 | var ok bool 127 | ibIface, ok = guidToInterface[guidKey] 128 | if !ok || iface == nil { 129 | continue 130 | } 131 | 132 | ifaceAddrs, err := iface.Addrs() 133 | if err != nil || len(ifaceAddrs) == 0 { 134 | continue 135 | } 136 | found := false 137 | for _, ifaceAddr := range ifaceAddrs { 138 | ifaceIPNet := ifaceAddr.(*net.IPNet) 139 | if ifaceIPNet.IP.To4() == nil { 140 | continue 141 | } 142 | if ip.Equal(ifaceIPNet.IP) { 143 | found = true 144 | break 145 | } 146 | } 147 | if found { 148 | break 149 | } else { 150 | ibIface = nil 151 | } 152 | } 153 | return ibIface 154 | } 155 | 156 | func InterfaceAddrs() []net.Addr { 157 | if pds == nil { 158 | panic("ib not initialzied") 159 | } 160 | 161 | ifaces, err := net.Interfaces() 162 | if err != nil { 163 | return nil 164 | } 165 | 
166 | var ibAddrs []net.Addr 167 | for i := range ifaces { 168 | iface := &ifaces[i] 169 | if len(iface.HardwareAddr) != 20 { 170 | continue 171 | } 172 | ifaceAddrs, err := iface.Addrs() 173 | if err != nil { 174 | continue 175 | } 176 | for _, addr := range ifaceAddrs { 177 | ipAddr := addr.(*net.IPNet) 178 | if ipAddr.IP.To4() == nil { 179 | continue 180 | } 181 | ibAddrs = append(ibAddrs, ipAddr) 182 | } 183 | } 184 | return ibAddrs 185 | } 186 | 187 | func walkDevices(callback func(device *C.struct_ibv_device)) { 188 | var numDevices C.int 189 | deviceList, err := C.ibv_get_device_list(&numDevices) 190 | if err != nil { 191 | return 192 | } 193 | defer C.ibv_free_device_list(deviceList) 194 | devicePtr := deviceList 195 | device := *devicePtr 196 | for device != nil { 197 | callback(device) 198 | prevDevicePtr := uintptr(unsafe.Pointer(devicePtr)) 199 | sizeofPtr := uintptr(unsafe.Sizeof(devicePtr)) 200 | devicePtr = (**C.struct_ibv_device)(unsafe.Pointer(prevDevicePtr + sizeofPtr)) 201 | device = *devicePtr 202 | } 203 | return 204 | } 205 | 206 | func handleAsyncEvents(ctx *C.struct_ibv_context) { 207 | var event C.struct_ibv_async_event 208 | errno := C.ibv_get_async_event(ctx, &event) 209 | if errno != 0 { 210 | panic(newError("ibv_get_async_event", errno)) 211 | } 212 | C.ibv_ack_async_event(&event) 213 | // ignore most async events 214 | switch event.event_type { 215 | case C.IBV_EVENT_CQ_ERR: 216 | panic("Async event: CQ overrun") 217 | case C.IBV_EVENT_QP_FATAL: 218 | case C.IBV_EVENT_QP_ACCESS_ERR: 219 | case C.IBV_EVENT_COMM_EST: 220 | case C.IBV_EVENT_CLIENT_REREGISTER: 221 | default: 222 | panic(fmt.Sprintf("Async event: %+v", event)) 223 | } 224 | } 225 | 226 | func Initialize() { 227 | if pds != nil { 228 | panic("ib already initialzied") 229 | } 230 | 231 | pds = make(pdsSet, 0) 232 | walkDevices(func(device *C.struct_ibv_device) { 233 | ctx := C.ibv_open_device(device) 234 | if ctx == nil { 235 | panic("ibv_open_device: failure") 236 | } 237 | 
newInterfaces(ctx) 238 | go handleAsyncEvents(ctx) 239 | }) 240 | 241 | // skip memlock check if there is no IB hardware 242 | if len(pds) == 0 { 243 | return 244 | } 245 | 246 | rlim, err := Getrlimit(RLIMIT_MEMLOCK) 247 | if err != nil { 248 | panic(os.NewSyscallError("getrlimit", err)) 249 | } 250 | const maxUint64 = 1<<64 - 1 251 | if rlim.Cur != uint64(maxUint64) || rlim.Max != uint64(maxUint64) { 252 | panic("ib: MEMLOCK rlimit is not unlimited") 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /conn_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ib 16 | 17 | import ( 18 | "fmt" 19 | "github.com/jsgilmore/shm" 20 | "math/rand" 21 | "net" 22 | "sync" 23 | "sync/atomic" 24 | "testing" 25 | "time" 26 | ) 27 | 28 | var tmpfsBuf, hugeBuf shm.Buffer 29 | 30 | func init() { 31 | if !SetupOptional() { 32 | return 33 | } 34 | var err error 35 | tmpfsBuf, err = shm.NewBufferTmpfs(2<<20, shm.PROT_RDWR) 36 | if err != nil { 37 | panic(err) 38 | } 39 | hugeBuf, err = shm.NewBufferHugepages(64<<20, shm.PROT_RDWR) 40 | if err != nil { 41 | panic(err) 42 | } 43 | } 44 | 45 | func randomMemory(rng *rand.Rand) *MemoryRegion { 46 | var mr *MemoryRegion 47 | var err error 48 | options := 3 49 | if tmpfsBuf != nil && hugeBuf != nil { 50 | options += 2 51 | } 52 | switch rng.Intn(options) { 53 | case 0: 54 | mr, err = AllocateMemory(4096) 55 | case 1: 56 | mr, err = AllocateMemory(65536) 57 | case 2: 58 | mr, err = AllocateMemory(1048576) 59 | case 3: 60 | mr, err = RegisterMemory(tmpfsBuf.Bytes()) 61 | case 4: 62 | mr, err = RegisterMemory(hugeBuf.Bytes()) 63 | default: 64 | panic("invalid mode") 65 | } 66 | if err != nil { 67 | panic(err) 68 | } 69 | return mr 70 | } 71 | 72 | func tmpfsMemory(rng *rand.Rand) *MemoryRegion { 73 | mr, err := RegisterMemory(tmpfsBuf.Bytes()) 74 | if err != nil { 75 | panic(err) 76 | } 77 | return mr 78 | } 79 | 80 | func hugepagesMemory(rng *rand.Rand) *MemoryRegion { 81 | mr, err := RegisterMemory(hugeBuf.Bytes()) 82 | if err != nil { 83 | panic(err) 84 | } 85 | return mr 86 | } 87 | 88 | const maxTimeout = 1e9 89 | 90 | type regionFunc func(*rand.Rand) *MemoryRegion 91 | 92 | func testConn(c *RCConn, rng *rand.Rand, newRegion regionFunc) { 93 | c.SetTimeout(rng.Int63n(maxTimeout)) 94 | var err error 95 | var errError string 96 | nmessages := rng.Intn(10) 97 | for i := 0; i < nmessages; i++ { 98 | mr := newRegion(rng) 99 | switch rng.Intn(3) { 100 | case 0: 101 | fmt.Printf("reading %v\n", mr) 102 | // Because timeout errors also continue the loop, two 103 | // readers 
connected to each other will also eventually 104 | // make progress. 105 | err = c.Read(mr) 106 | case 1: 107 | fmt.Printf("writing %v\n", mr) 108 | err = c.Write(mr) 109 | case 2: 110 | fmt.Printf("sleeping for up to maxTimeout\n") 111 | time.Sleep(time.Duration(rng.Int63n(maxTimeout))) 112 | default: 113 | panic("invalid mode") 114 | } 115 | if err != nil { 116 | // Save error string now so that it doesn't refer to a 117 | // closed memory region when it is printed later. 118 | errError = err.Error() 119 | } 120 | if err := mr.Close(); err != nil { 121 | panic(err) 122 | } 123 | if err != nil { 124 | if t, ok := err.(timeout); ok && t.Timeout() { 125 | continue 126 | } 127 | break 128 | } 129 | } 130 | if err == nil { 131 | fmt.Printf("SUCCESSFUL EXCHANGE of %d MESSAGES\n", nmessages) 132 | } else { 133 | fmt.Printf("%v\n", errError) 134 | } 135 | if err := c.Close(); err != nil { 136 | panic("close failed: " + err.Error()) 137 | } 138 | } 139 | 140 | func testConnParams(nlisteners, ndialers, nconn int, newRegion regionFunc, t *testing.T) { 141 | ibAddrs := []net.Addr{} 142 | for _, addr := range InterfaceAddrs() { 143 | if iface := InterfaceForAddr(addr); iface == nil { 144 | fmt.Printf("iface is nil\n") 145 | continue 146 | } else if !iface.Active() { 147 | fmt.Printf("iface is not active\n") 148 | continue 149 | } 150 | ibAddrs = append(ibAddrs, addr) 151 | } 152 | if len(ibAddrs) == 0 { 153 | t.Skip("no interfaces to test with") 154 | } 155 | 156 | // set up listeners 157 | listeners := []*RCListener{} 158 | for _, addr := range ibAddrs { 159 | raddr := &net.TCPAddr{IP: addr.(*net.IPNet).IP} 160 | fmt.Printf("listeners for %v\n", raddr) 161 | for k := 0; k < nlisteners; k++ { 162 | l, err := ListenRC(raddr) 163 | if err != nil { 164 | panic("ListenRC failed: " + err.Error()) 165 | } 166 | listeners = append(listeners, l) 167 | } 168 | } 169 | 170 | // start dialers 171 | var dialWg sync.WaitGroup 172 | for kk := 0; kk < ndialers; kk++ { 173 | dialWg.Add(1) 
174 | k := kk 175 | go func() { 176 | defer dialWg.Done() 177 | rng := rand.New(rand.NewSource(int64(k))) 178 | for j := 0; j < nconn; j++ { 179 | addr := ibAddrs[rng.Intn(len(ibAddrs))] 180 | laddr := &net.TCPAddr{IP: addr.(*net.IPNet).IP} 181 | l := listeners[rng.Intn(len(listeners))] 182 | raddr := l.Addr().(*net.TCPAddr) 183 | 184 | fmt.Printf("dialing %v -> %v\n", laddr, raddr) 185 | c, err := DialRC(laddr, raddr) 186 | if err != nil { 187 | panic("DialRC failed: " + err.Error()) 188 | } 189 | fmt.Printf("connected\n") 190 | testConn(c, rng, newRegion) 191 | fmt.Printf("dialer %d conn %d DONE\n", k, j) 192 | } 193 | }() 194 | } 195 | 196 | var acceptedConns uint32 197 | // accept on listeners 198 | var listenWg sync.WaitGroup 199 | for kk, listener := range listeners { 200 | listenWg.Add(1) 201 | k := kk 202 | l := listener 203 | go func() { 204 | defer listenWg.Done() 205 | rng := rand.New(rand.NewSource(int64(k))) 206 | for j := 0; true; j++ { 207 | fmt.Printf("Accepting on %v\n", l.Addr()) 208 | c, err := l.Accept() 209 | if err != nil { 210 | break 211 | } 212 | fmt.Printf("Accepted\n") 213 | testConn(c, rng, newRegion) 214 | fmt.Printf("listener %d conn %d DONE\n", k, j) 215 | atomic.AddUint32(&acceptedConns, 1) 216 | } 217 | }() 218 | } 219 | 220 | fmt.Printf("waiting for dialers\n") 221 | dialWg.Wait() 222 | 223 | fmt.Printf("closing listeners\n") 224 | for _, l := range listeners { 225 | if err := l.Close(); err != nil { 226 | panic("listener close failed: " + err.Error()) 227 | } 228 | } 229 | 230 | fmt.Printf("waiting for listeners\n") 231 | listenWg.Wait() 232 | 233 | if ndialers*nconn != int(acceptedConns) { 234 | panic("accepted conns != expected conns") 235 | } 236 | } 237 | 238 | func TestConn1(t *testing.T) { 239 | testConnParams(1, 1, 1, randomMemory, t) 240 | } 241 | 242 | func TestConn4(t *testing.T) { 243 | if testing.Short() { 244 | return 245 | } 246 | testConnParams(1, 2, 2, randomMemory, t) 247 | } 248 | 249 | func TestConn50(t 
*testing.T) { 250 | if testing.Short() { 251 | return 252 | } 253 | testConnParams(5, 5, 10, randomMemory, t) 254 | } 255 | 256 | func TestTmpfs(t *testing.T) { 257 | if testing.Short() || !SetupOptional() { 258 | t.Skip("skipping test") 259 | } 260 | testConnParams(5, 5, 5, tmpfsMemory, t) 261 | } 262 | 263 | func TestHugepages(t *testing.T) { 264 | if testing.Short() || !SetupOptional() { 265 | t.Skip("skipping test") 266 | } 267 | testConnParams(5, 5, 5, hugepagesMemory, t) 268 | } 269 | -------------------------------------------------------------------------------- /conn_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | import ( 20 | "errors" 21 | "fmt" 22 | "io" 23 | "math" 24 | "net" 25 | "time" 26 | ) 27 | 28 | // RCConn can only be used in one direction (read or write) by a 29 | // single goroutine. Simplicity rules. 
30 | 31 | const ( 32 | immRendezvousStart = iota + 1 33 | immRendezvousReply 34 | immRendezvousFinish 35 | immClose 36 | ) 37 | 38 | type RCConn struct { 39 | iface *Interface 40 | laddr net.Addr 41 | raddr net.Addr 42 | qp QueuePair 43 | timeout int64 44 | writeTimeoutFatal bool 45 | messages []*MemoryRegion 46 | meta *MemoryRegion 47 | keepalive bool 48 | } 49 | 50 | func NewRCConn(iface *Interface, qp QueuePair, timeout int64, keepalive bool) *RCConn { 51 | messages, meta := CreateBuffers() 52 | return &RCConn{iface, nil, nil, qp, timeout, true, messages, meta, keepalive} 53 | } 54 | 55 | func ioDeadline() time.Time { 56 | return time.Now().Add(5 * time.Second) 57 | } 58 | 59 | func CreateBuffers() (messages []*MemoryRegion, meta *MemoryRegion) { 60 | // allocate receive buffers 61 | for i := 0; i < 2; i++ { 62 | mr, err := AllocateMemory(4096) 63 | if err != nil { 64 | panic(fmt.Errorf("AllocateMemory: %v", err)) 65 | } 66 | messages = append(messages, mr) 67 | } 68 | 69 | // allocate send buffer 70 | meta, err := AllocateMemory(64) 71 | if err != nil { 72 | panic(fmt.Errorf("AllocateMemory: %v", err)) 73 | } 74 | return messages, meta 75 | } 76 | 77 | func NewRCConnFromNetConn(c net.Conn) (*RCConn, error) { 78 | addr := c.LocalAddr().(*net.TCPAddr) 79 | iface := InterfaceForAddr(&net.IPNet{IP: addr.IP}) 80 | if iface == nil { 81 | return nil, errNoInterfaceForAddr 82 | } 83 | return newRCConn(c, iface) 84 | } 85 | 86 | func newRCConn(c net.Conn, iface *Interface) (*RCConn, error) { 87 | // Leave enough room in the completion queue for any operation, 88 | // including inline sends, to return an error. CQ overruns 89 | // sometimes cause internal errors in the HCA, which can make the 90 | // kernel very unhappy. 
91 | qp, err := iface.NewQueuePair(10) 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | if err := c.SetDeadline(ioDeadline()); err != nil { 97 | checkClose(qp) 98 | return nil, err 99 | } 100 | destLid, destQpn, destPsn, err := writeReadQPParams(c, iface.Lid(), qp.Qpn(), qp.Psn()) 101 | if err != nil { 102 | checkClose(qp) 103 | return nil, err 104 | } 105 | 106 | messages, meta := CreateBuffers() 107 | 108 | if err := qp.Setup(destLid, destQpn, destPsn, messages); err != nil { 109 | checkClose(qp) 110 | return nil, err 111 | } 112 | 113 | laddr, raddr := c.LocalAddr(), c.RemoteAddr() 114 | 115 | rcc := &RCConn{iface, laddr, raddr, qp, math.MaxInt64, true, messages, meta, false} 116 | return rcc, nil 117 | } 118 | 119 | func (c *RCConn) SetTimeout(nsec int64) { 120 | if nsec < 0 { 121 | panic("RCConn: SetTimeout < 0") 122 | } 123 | if nsec > 0 { 124 | c.timeout = nsec 125 | } else { 126 | c.timeout = math.MaxInt64 127 | } 128 | } 129 | 130 | func (c *RCConn) WriteTimeoutFatal(v bool) { 131 | c.writeTimeoutFatal = v 132 | } 133 | 134 | func (c *RCConn) LocalAddr() net.Addr { 135 | return c.laddr 136 | } 137 | 138 | func (c *RCConn) RemoteAddr() net.Addr { 139 | return c.raddr 140 | } 141 | 142 | func (c *RCConn) Setup(destLid uint16, destQpn, destPsn uint32) error { 143 | return c.qp.Setup(destLid, destQpn, destPsn, c.messages) 144 | } 145 | 146 | type rendezvousReply struct { 147 | remoteAddr uint64 148 | rkey uint32 149 | } 150 | 151 | type timeout interface { 152 | Timeout() bool 153 | } 154 | 155 | func (this *RCConn) postKeepalive() error { 156 | // Don't send a keepalive if one is outstanding already. This 157 | // avoids CQ overruns if the timeout is very small. 
158 | if this.keepalive { 159 | return nil 160 | } 161 | this.keepalive = true 162 | if err := this.qp.PostKeepalive(); err != nil { 163 | return fmt.Errorf("keepalive: %v", err) 164 | } 165 | return nil 166 | } 167 | 168 | func (this *RCConn) poll() (*WorkCompletion, error) { 169 | wc, err := this.qp.Poll(this.timeout) 170 | if err != nil { 171 | if t, ok := err.(timeout); ok && t.Timeout() { 172 | if err := this.postKeepalive(); err != nil { 173 | this.closeAfterError(err) 174 | return nil, err 175 | } 176 | // Return without shutting down the connection, because 177 | // some timeouts aren't fatal errors. 178 | return nil, err 179 | } 180 | this.closeAfterError(err) 181 | return nil, err 182 | } 183 | 184 | // Check if work completion is a keepalive. If so, clear the flag 185 | // so that another keepalive may be sent. 186 | if wc != nil && wc.Keepalive() { 187 | this.keepalive = false 188 | return nil, nil 189 | } 190 | 191 | return wc, nil 192 | } 193 | 194 | var errStartOnClosedConn = errors.New("start on closed connection") 195 | 196 | func (this *RCConn) start() error { 197 | if this.qp == nil { 198 | return errStartOnClosedConn 199 | } 200 | 201 | for { 202 | wc, err := this.poll() 203 | if err != nil { 204 | // a timeout here is not fatal 205 | return err 206 | } 207 | if wc == nil { 208 | continue 209 | } 210 | if !wc.Success() { 211 | err := fmt.Errorf("receive start: error wc: %s", wc) 212 | this.closeAfterError(nil) 213 | return err 214 | } 215 | immData := wc.ImmData() 216 | if !wc.Receive() || immData != immRendezvousStart { 217 | this.closeAfterError(nil) 218 | if immData == immClose { 219 | return io.EOF 220 | } 221 | err := fmt.Errorf("receive start: unexpected wc: %s", wc) 222 | return err 223 | } 224 | // repost receive buffer 225 | recvMr := wc.MemoryRegion() 226 | if err := this.qp.PostReceive(recvMr); err != nil { 227 | this.closeAfterError(err) 228 | return err 229 | } 230 | break 231 | } 232 | return nil 233 | } 234 | 235 | func (this 
*RCConn) replyFinish(mr *MemoryRegion, meta []byte) error { 236 | // Post send for rendezvous reply. 237 | reply := (*rendezvousReply)(this.meta.Ptr()) 238 | reply.remoteAddr = uint64(uintptr(mr.Ptr())) 239 | reply.rkey = mr.RemoteKey(this.iface.pd) 240 | 241 | if err := this.qp.PostSendImm(immRendezvousReply, this.meta); err != nil { 242 | this.closeAfterError(err) 243 | return fmt.Errorf("send reply: %v", err) 244 | } 245 | 246 | // Wait for receive of rendezvous finish 247 | for { 248 | wc, err := this.poll() 249 | if err != nil { 250 | this.closeAfterError(err) 251 | return makeTimeoutFatal(err) 252 | } 253 | // poll can return a nil wc in case of keepalive 254 | if wc == nil { 255 | continue 256 | } 257 | 258 | if !wc.Success() { 259 | err := fmt.Errorf("receive finish: error wc: %s", wc) 260 | this.closeAfterError(nil) 261 | return err 262 | } 263 | 264 | if wc.Send() { 265 | // reply has been sent 266 | continue 267 | } 268 | 269 | if !wc.Receive() || wc.ImmData() != immRendezvousFinish { 270 | err := fmt.Errorf("receive finish: unexpected wc: %s", wc) 271 | this.closeAfterError(nil) 272 | return err 273 | } 274 | // repost receive buffer 275 | recvMr := wc.MemoryRegion() 276 | if meta != nil { 277 | // recvMr is always 4k 278 | copy(meta, recvMr.Bytes()[:len(meta)]) 279 | } 280 | if err := this.qp.PostReceive(recvMr); err != nil { 281 | this.closeAfterError(err) 282 | return err 283 | } 284 | break 285 | } 286 | return nil 287 | } 288 | 289 | func rcConnError(name string, err error) error { 290 | if err == nil { 291 | return nil 292 | } 293 | // don't break timeout errors 294 | if t, ok := err.(timeout); ok && t.Timeout() { 295 | return err 296 | } 297 | if err == io.EOF { 298 | return err 299 | } 300 | return fmt.Errorf("ib: RCConn.%s: %v", name, err) 301 | } 302 | 303 | // Reading on both ends of the connection will not return an error. 
304 | func (this *RCConn) Read(mr *MemoryRegion) error { 305 | return this.ReadMeta(mr, nil) 306 | } 307 | 308 | func (this *RCConn) ReadMeta(mr *MemoryRegion, meta []byte) error { 309 | if err := this.start(); err != nil { 310 | return rcConnError("ReadMeta", err) 311 | } 312 | return rcConnError("ReadMeta", this.replyFinish(mr, meta)) 313 | } 314 | 315 | var errReadPooledIntr = errors.New("ib: RCConn: ReadPooled interrupted") 316 | 317 | func (this *RCConn) ReadPooled(mrChan <-chan *MemoryRegion) (*MemoryRegion, error) { 318 | if err := this.start(); err != nil { 319 | return nil, rcConnError("ReadPooled", err) 320 | } 321 | mr, ok := <-mrChan 322 | if !ok || mr == nil { 323 | return nil, errReadPooledIntr 324 | } 325 | return mr, rcConnError("ReadPooled", this.replyFinish(mr, nil)) 326 | } 327 | 328 | var errFatalTimeout = errors.New("fatal timeout") 329 | 330 | func makeTimeoutFatal(err error) error { 331 | if t, ok := err.(timeout); ok && t.Timeout() { 332 | return errFatalTimeout 333 | } 334 | return err 335 | } 336 | 337 | var errWriteOnClosedConn = errors.New("write on closed connection") 338 | 339 | // Writing on both ends of the connection will return an error. 
340 | func (this *RCConn) Write(mr *MemoryRegion) error { 341 | err := this.WriteMetaStart() 342 | if err != nil { 343 | return rcConnError("Write", err) 344 | } 345 | return rcConnError("Write", this.write(mr, nil)) 346 | } 347 | 348 | func (this *RCConn) WriteMeta(mr *MemoryRegion, meta []byte) error { 349 | return rcConnError("WriteMeta", this.write(mr, meta)) 350 | } 351 | 352 | func (this *RCConn) WriteMetaStart() error { 353 | if this.qp == nil { 354 | return errWriteOnClosedConn 355 | } 356 | 357 | // Post send for rendezvous start 358 | if err := this.qp.PostSendImm(immRendezvousStart, nil); err != nil { 359 | this.closeAfterError(err) 360 | return err 361 | } 362 | return nil 363 | } 364 | 365 | func (this *RCConn) write(mr *MemoryRegion, meta []byte) error { 366 | if this.qp == nil { 367 | return errWriteOnClosedConn 368 | } 369 | 370 | // wait for reply 371 | var reply rendezvousReply 372 | for { 373 | wc, err := this.poll() 374 | if err != nil { 375 | if this.writeTimeoutFatal { 376 | this.closeAfterError(err) 377 | return fmt.Errorf("receive reply poll: %v", makeTimeoutFatal(err)) 378 | } 379 | return err 380 | } 381 | // poll can return a nil wc in case of keepalive 382 | if wc == nil { 383 | continue 384 | } 385 | 386 | if !wc.Success() { 387 | err := fmt.Errorf("receive reply: error wc: %s", wc) 388 | this.closeAfterError(nil) 389 | return err 390 | } 391 | 392 | if wc.Send() { 393 | // start has been sent 394 | continue 395 | } 396 | 397 | if !wc.Receive() || wc.ImmData() != immRendezvousReply { 398 | err := fmt.Errorf("receive reply: unexpected wc: %s", wc) 399 | this.closeAfterError(nil) 400 | return err 401 | } 402 | 403 | recvMr := wc.MemoryRegion() 404 | reply = *(*rendezvousReply)(recvMr.Ptr()) 405 | // Post receive for rendezvous reply again 406 | if err := this.qp.PostReceive(recvMr); err != nil { 407 | this.closeAfterError(err) 408 | return err 409 | } 410 | break 411 | } 412 | 413 | // Post RDMA write with reply parameters 414 | if err := 
this.qp.PostWrite(mr, reply.remoteAddr, reply.rkey); err != nil { 415 | this.closeAfterError(err) 416 | return err 417 | } 418 | 419 | for { 420 | wc, err := this.poll() 421 | if err != nil { 422 | this.closeAfterError(err) 423 | return fmt.Errorf("rdma write poll: %v", makeTimeoutFatal(err)) 424 | } 425 | // poll can return a nil wc in case of keepalive 426 | if wc == nil { 427 | continue 428 | } 429 | if !wc.Success() { 430 | err := fmt.Errorf("rdma write: error wc: %s", wc) 431 | this.closeAfterError(nil) 432 | return err 433 | } 434 | 435 | // this is a hack. we shouldn't have to deal with this 436 | // work completion at this stage. we should have remained in 437 | // the previous state until both the start and the reply 438 | // completed. 439 | if wc.Send() { 440 | // start has been sent 441 | continue 442 | } 443 | 444 | if !wc.Write() { 445 | err := fmt.Errorf("rdma write: unexpected wc: %s", wc) 446 | this.closeAfterError(nil) 447 | return err 448 | } 449 | break 450 | } 451 | 452 | if meta != nil { 453 | n := copy(this.meta.Bytes(), meta) 454 | if n != len(meta) { 455 | panic("ib: buffer too short for meta") 456 | } 457 | } 458 | 459 | // Post send for rendezvous finish 460 | if err := this.qp.PostSendImm(immRendezvousFinish, this.meta); err != nil { 461 | this.closeAfterError(err) 462 | return err 463 | } 464 | 465 | for { 466 | wc, err := this.poll() 467 | if err != nil { 468 | this.closeAfterError(err) 469 | return fmt.Errorf("send finish poll: %v", makeTimeoutFatal(err)) 470 | } 471 | // poll can return a nil wc in case of keepalive 472 | if wc == nil { 473 | continue 474 | } 475 | if !wc.Success() { 476 | err := fmt.Errorf("send finish: error wc: %s", wc) 477 | this.closeAfterError(err) 478 | return err 479 | } 480 | if !wc.Send() { 481 | err := fmt.Errorf("send finish: unexpected wc: %s", wc) 482 | this.closeAfterError(err) 483 | return err 484 | } 485 | break 486 | } 487 | 488 | return nil 489 | } 490 | 491 | func (this *RCConn) Close() error { 
492 | return this.closeImpl(false) 493 | } 494 | 495 | // closeAfterError closes the RCConn after an error has occured. This 496 | // also closes the memory regions used for some of the protocol 497 | // messages, so errors that refer to work completions should be 498 | // formatted before this function is called. 499 | func (this *RCConn) closeAfterError(err error) { 500 | closeErr := this.closeImpl(true) 501 | if closeErr != nil { 502 | // an error has already occurred, so there is no good way to 503 | // handle a second error 504 | panic(fmt.Errorf("ib: RCConn.closeAfterError: %v (%v)", closeErr, err)) 505 | } 506 | } 507 | 508 | func (this *RCConn) closeImpl(broken bool) error { 509 | // Allow multiple closes, because read and write errors also cause 510 | // Close to be called. 511 | if this.qp == nil { 512 | return nil 513 | } 514 | 515 | // Send a message to shut down the remote state machine. This 516 | // introduces some delay if both ends shut down simultaneously, 517 | // but our current protocols don't usually do this. 518 | err := this.qp.PostSendImm(immClose, nil) 519 | if err != nil { 520 | broken = true 521 | } 522 | // If the connection isn't broken yet, keep polling until all the 523 | // sends, including the close, is complete. 
524 | for !broken && this.qp.Sending() { 525 | _, err := this.qp.Poll(this.timeout) 526 | if err != nil { 527 | break 528 | } 529 | } 530 | 531 | if err := this.qp.Close(); err != nil { 532 | return err 533 | } 534 | this.qp = nil 535 | 536 | // memory regions must be closed after the QP 537 | if this.meta != nil { 538 | if err := this.meta.Close(); err != nil { 539 | return err 540 | } 541 | this.meta = nil 542 | } 543 | 544 | for _, mr := range this.messages { 545 | if err := mr.Close(); err != nil { 546 | return err 547 | } 548 | } 549 | this.messages = nil 550 | 551 | return nil 552 | } 553 | -------------------------------------------------------------------------------- /qp_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Vastech SA (PTY) LTD 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // +build linux 16 | 17 | package ib 18 | 19 | //#include 20 | import "C" 21 | 22 | import ( 23 | "errors" 24 | "fmt" 25 | "math/rand" 26 | "os" 27 | "syscall" 28 | "time" 29 | "unsafe" 30 | ) 31 | 32 | type QueuePair interface { 33 | Qpn() uint32 34 | Psn() uint32 35 | Reset() error 36 | Init() error 37 | ReadyToReceive(destLid uint16, destQpn, destPsn uint32) error 38 | ReadyToSend() error 39 | Error() error 40 | Close() error 41 | PostSend(mr *MemoryRegion) error 42 | PostSendImm(imm uint32, mr *MemoryRegion) error 43 | PostReceive(mr *MemoryRegion) error 44 | PostWrite(mr *MemoryRegion, remoteAddr uint64, rkey uint32) error 45 | PostKeepalive() error 46 | Setup(destLid uint16, destQpn, destPsn uint32, messages []*MemoryRegion) error 47 | Poll(nsec int64) (*WorkCompletion, error) 48 | Query() *QPAttr 49 | Sending() bool 50 | } 51 | 52 | type QPAttr struct { 53 | attr C.struct_ibv_qp_attr 54 | initAttr C.struct_ibv_qp_init_attr 55 | } 56 | 57 | // workRequests ensures that there is a reference to each memory 58 | // region being used, even after it has been posted. This obviates the 59 | // need for a mapper like in Go's syscall.Mmap. 
60 | 61 | type queuePair struct { 62 | iface *Interface 63 | qp *C.struct_ibv_qp 64 | port uint8 65 | psn uint32 66 | pollfd []Pollfd 67 | workRequests map[C.uint64_t]workRequest 68 | } 69 | 70 | func (iface *Interface) createCompletionQueue(cqe int) *C.struct_ibv_cq { 71 | compChannel := C.ibv_create_comp_channel(iface.ctx) 72 | if compChannel == nil { 73 | panic("ibv_create_comp_channel: failure") 74 | } 75 | if err := syscall.SetNonblock(int(compChannel.fd), true); err != nil { 76 | panic(err) 77 | } 78 | cq := C.ibv_create_cq(iface.ctx, C.int(cqe), nil, compChannel, 0) 79 | if cq != nil { 80 | return cq 81 | } 82 | errno := C.ibv_destroy_comp_channel(compChannel) 83 | if errno != 0 { 84 | panic(newError("ibv_destroy_comp_channel", errno)) 85 | } 86 | return nil 87 | } 88 | 89 | // The spec says: 90 | // 91 | // An unsignaled Work Request that completed successfully is confirmed 92 | // when all of the following rules are met: 93 | // - A Work Completion is retrieved from the same CQ that is 94 | // associated with the Send Queue to which the unsignaled Work Request 95 | // was submitted. 96 | // - That Work Completion corresponds to a subsequent Work Request on 97 | // the same Send Queue as the unsignaled Work Request. 98 | // 99 | // This means that if we only do unsignaled sends on a CQ, even if we 100 | // do signaled receives, the Send Queue associated with the CQ will 101 | // fill up because the unsignaled Work Requests are not confirmed. 
102 | 103 | func (iface *Interface) NewQueuePair(cqe int) (QueuePair, error) { 104 | cq := iface.createCompletionQueue(cqe) 105 | if cq == nil { 106 | return nil, newError("ibv_create_cq", -1) 107 | } 108 | 109 | initAttr := C.struct_ibv_qp_init_attr{} 110 | initAttr.send_cq = cq 111 | initAttr.recv_cq = cq 112 | initAttr.cap.max_send_wr = C.uint32_t(cqe) 113 | initAttr.cap.max_recv_wr = C.uint32_t(cqe) 114 | initAttr.cap.max_send_sge = 1 115 | initAttr.cap.max_recv_sge = 1 116 | initAttr.cap.max_inline_data = 64 117 | initAttr.qp_type = C.IBV_QPT_RC 118 | 119 | // Make everything signaled. This avoids the problem with inline 120 | // sends filling up the send queue of the CQ. 121 | initAttr.sq_sig_all = 1 122 | 123 | cqp := C.ibv_create_qp(iface.pd, &initAttr) 124 | if cqp == nil { 125 | return nil, newError("ibv_create_qp", -1) 126 | } 127 | 128 | rng := rand.New(rand.NewSource(time.Now().UnixNano())) 129 | // mask psn to make sure it isn't too big 130 | psn := rng.Uint32() & 0xffffff 131 | 132 | pollfd := make([]Pollfd, 1) 133 | pollfd[0].Fd = int32(cq.channel.fd) 134 | pollfd[0].Events = POLLIN 135 | workRequests := make(map[C.uint64_t]workRequest) 136 | qp := &queuePair{iface, cqp, iface.port, psn, pollfd, workRequests} 137 | 138 | // Reset and Init QP here instead so that Close (which transitions 139 | // QP to Error state) can be called immediately after this 140 | // function returns. This happened when writeReadQPParams returned 141 | // an error in newRCConn. 
142 | 143 | if err := qp.Reset(); err != nil { 144 | // no reason for this to fail 145 | panic(err) 146 | } 147 | 148 | if err := qp.Init(); err != nil { 149 | // no reason for this to fail 150 | panic(err) 151 | } 152 | 153 | return qp, nil 154 | } 155 | 156 | func destroyCompletionQueue(cq *C.struct_ibv_cq) error { 157 | if cq == nil { 158 | return nil 159 | } 160 | channel := cq.channel 161 | // CQ must be destroyed before completion channel 162 | errno := C.ibv_destroy_cq(cq) 163 | if errno != 0 { 164 | return newError("ibv_destroy_cq", errno) 165 | } 166 | if channel != nil { 167 | errno := C.ibv_destroy_comp_channel(channel) 168 | if errno != 0 { 169 | return newError("ibv_destroy_comp_channel", errno) 170 | } 171 | } 172 | return nil 173 | } 174 | 175 | var errQPAlreadyClosed = errors.New("ib: queuePair: already closed") 176 | 177 | func (qp *queuePair) Close() error { 178 | if qp.qp == nil { 179 | return errQPAlreadyClosed 180 | } 181 | 182 | // Transition QP to error state. Queue processing is stopped. 183 | // Work Requests pending or in process are completed in error, 184 | // when possible. 
185 | if err := qp.Error(); err != nil { 186 | return err 187 | } 188 | 189 | send_cq := qp.qp.send_cq 190 | recv_cq := qp.qp.recv_cq 191 | errno := C.ibv_destroy_qp(qp.qp) 192 | if errno != 0 { 193 | return newError("ibv_destroy_qp", errno) 194 | } 195 | qp.qp = nil 196 | 197 | if send_cq != nil { 198 | if err := destroyCompletionQueue(send_cq); err != nil { 199 | return err 200 | } 201 | } 202 | if recv_cq != send_cq { 203 | if err := destroyCompletionQueue(recv_cq); err != nil { 204 | return err 205 | } 206 | } 207 | 208 | return newError("ibv_destroy_qp", errno) 209 | } 210 | 211 | func (qp *queuePair) Qpn() uint32 { 212 | return uint32(qp.qp.qp_num) 213 | } 214 | 215 | func (qp *queuePair) Psn() uint32 { 216 | return qp.psn 217 | } 218 | 219 | func (qp *queuePair) modify(attr *C.struct_ibv_qp_attr, mask int) error { 220 | errno := C.ibv_modify_qp(qp.qp, attr, C.int(mask)) 221 | return newError("ibv_modify_qp", errno) 222 | } 223 | 224 | func (qp *queuePair) Reset() error { 225 | attr := C.struct_ibv_qp_attr{} 226 | attr.qp_state = C.IBV_QPS_RESET 227 | mask := C.IBV_QP_STATE 228 | return qp.modify(&attr, mask) 229 | } 230 | 231 | func (qp *queuePair) Init() error { 232 | attr := C.struct_ibv_qp_attr{} 233 | attr.qp_state = C.IBV_QPS_INIT 234 | attr.pkey_index = 0 235 | attr.port_num = C.uint8_t(qp.port) 236 | // allow RDMA write 237 | attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE 238 | mask := C.IBV_QP_STATE | C.IBV_QP_PKEY_INDEX | C.IBV_QP_PORT | C.IBV_QP_ACCESS_FLAGS 239 | return qp.modify(&attr, mask) 240 | } 241 | 242 | func (qp *queuePair) ReadyToReceive(destLid uint16, destQpn, destPsn uint32) error { 243 | attr := C.struct_ibv_qp_attr{} 244 | attr.qp_state = C.IBV_QPS_RTR 245 | attr.path_mtu = C.IBV_MTU_2048 246 | attr.dest_qp_num = C.uint32_t(destQpn) 247 | attr.rq_psn = C.uint32_t(destPsn) 248 | // this must be > 0 to avoid IBV_WC_REM_INV_REQ_ERR 249 | attr.max_dest_rd_atomic = 1 250 | // Minimum RNR NAK timer (range 0..31) 251 | attr.min_rnr_timer = 
26 252 | attr.ah_attr.is_global = 0 253 | attr.ah_attr.dlid = C.uint16_t(destLid) 254 | attr.ah_attr.sl = 0 255 | attr.ah_attr.src_path_bits = 0 256 | attr.ah_attr.port_num = C.uint8_t(qp.port) 257 | mask := C.IBV_QP_STATE | C.IBV_QP_AV | C.IBV_QP_PATH_MTU | C.IBV_QP_DEST_QPN | 258 | C.IBV_QP_RQ_PSN | C.IBV_QP_MAX_DEST_RD_ATOMIC | C.IBV_QP_MIN_RNR_TIMER 259 | return qp.modify(&attr, mask) 260 | } 261 | 262 | func (qp *queuePair) ReadyToSend() error { 263 | attr := C.struct_ibv_qp_attr{} 264 | attr.qp_state = C.IBV_QPS_RTS 265 | // Local ack timeout for primary path. 266 | // Timeout is calculated as 4.096e-6*(2**attr.timeout) seconds. 267 | attr.timeout = 14 268 | // Retry count (7 means forever) 269 | attr.retry_cnt = 6 270 | // RNR retry (7 means forever) 271 | attr.rnr_retry = 6 272 | attr.sq_psn = C.uint32_t(qp.psn) 273 | // this must be > 0 to avoid IBV_WC_REM_INV_REQ_ERR 274 | attr.max_rd_atomic = 1 275 | mask := C.IBV_QP_STATE | C.IBV_QP_TIMEOUT | C.IBV_QP_RETRY_CNT | C.IBV_QP_RNR_RETRY | 276 | C.IBV_QP_SQ_PSN | C.IBV_QP_MAX_QP_RD_ATOMIC 277 | return qp.modify(&attr, mask) 278 | } 279 | 280 | func (qp *queuePair) Error() error { 281 | attr := C.struct_ibv_qp_attr{} 282 | attr.qp_state = C.IBV_QPS_ERR 283 | mask := C.IBV_QP_STATE 284 | return qp.modify(&attr, mask) 285 | } 286 | 287 | func (qp *queuePair) PostSend(mr *MemoryRegion) error { 288 | return qp.PostSendImm(0, mr) 289 | } 290 | 291 | func (qp *queuePair) PostSendImm(imm uint32, mr *MemoryRegion) error { 292 | if qp.qp == nil { 293 | return errQPAlreadyClosed 294 | } 295 | 296 | var sendWr C.struct_ibv_send_wr 297 | var bad *C.struct_ibv_send_wr 298 | if imm > 0 { 299 | sendWr.opcode = C.IBV_WR_SEND_WITH_IMM 300 | // always send inline if there is immediate data 301 | sendWr.send_flags = C.IBV_SEND_INLINE 302 | sendWr.imm_data = C.uint32_t(imm) 303 | } else { 304 | sendWr.opcode = C.IBV_WR_SEND 305 | sendWr.send_flags = C.IBV_SEND_SIGNALED 306 | } 307 | 308 | if mr != nil { 309 | var sge 
C.struct_ibv_sge 310 | sendWr.sg_list = &sge 311 | sendWr.num_sge = 1 312 | mr.populateSge(qp.iface.pd, &sge) 313 | } else { 314 | // send inline if there is no memory region to send 315 | sendWr.send_flags = C.IBV_SEND_INLINE 316 | } 317 | 318 | sendWr.wr_id = C.uint64_t(uintptr(unsafe.Pointer(&sendWr))) 319 | qp.workRequests[sendWr.wr_id] = workRequest{mr, &sendWr, nil, false} 320 | 321 | errno := C.ibv_post_send(qp.qp, &sendWr, &bad) 322 | return newError("ibv_post_send", errno) 323 | } 324 | 325 | type workRequest struct { 326 | mr *MemoryRegion 327 | sendWr *C.struct_ibv_send_wr 328 | recvWr *C.struct_ibv_recv_wr 329 | keepalive bool 330 | } 331 | 332 | func (qp *queuePair) PostReceive(mr *MemoryRegion) error { 333 | if qp.qp == nil { 334 | return errQPAlreadyClosed 335 | } 336 | 337 | var recvWr C.struct_ibv_recv_wr 338 | var sge C.struct_ibv_sge 339 | var bad *C.struct_ibv_recv_wr 340 | recvWr.sg_list = &sge 341 | recvWr.num_sge = 1 342 | mr.populateSge(qp.iface.pd, &sge) 343 | 344 | recvWr.wr_id = C.uint64_t(uintptr(unsafe.Pointer(&recvWr))) 345 | qp.workRequests[recvWr.wr_id] = workRequest{mr, nil, &recvWr, false} 346 | 347 | errno := C.ibv_post_recv(qp.qp, &recvWr, &bad) 348 | return newError("ibv_post_recv", errno) 349 | } 350 | 351 | type rdma struct { 352 | remoteAddr uint64 353 | rkey uint32 354 | } 355 | 356 | func (qp *queuePair) PostWrite(mr *MemoryRegion, remoteAddr uint64, rkey uint32) error { 357 | if qp.qp == nil { 358 | return errQPAlreadyClosed 359 | } 360 | 361 | var sendWr C.struct_ibv_send_wr 362 | var sge C.struct_ibv_sge 363 | var bad *C.struct_ibv_send_wr 364 | sendWr.opcode = C.IBV_WR_RDMA_WRITE 365 | sendWr.send_flags = C.IBV_SEND_SIGNALED 366 | sendWr.sg_list = &sge 367 | sendWr.num_sge = 1 368 | mr.populateSge(qp.iface.pd, &sge) 369 | r := (*rdma)(unsafe.Pointer(&sendWr.wr)) 370 | r.remoteAddr = remoteAddr 371 | r.rkey = rkey 372 | 373 | sendWr.wr_id = C.uint64_t(uintptr(unsafe.Pointer(&sendWr))) 374 | qp.workRequests[sendWr.wr_id] 
= workRequest{mr, &sendWr, nil, false} 375 | 376 | errno := C.ibv_post_send(qp.qp, &sendWr, &bad) 377 | return newError("ibv_post_send", errno) 378 | } 379 | 380 | // Zero-length RDMA write. 381 | func (qp *queuePair) PostKeepalive() error { 382 | if qp.qp == nil { 383 | return errQPAlreadyClosed 384 | } 385 | 386 | var sendWr C.struct_ibv_send_wr 387 | var bad *C.struct_ibv_send_wr 388 | sendWr.opcode = C.IBV_WR_RDMA_WRITE 389 | sendWr.send_flags = C.IBV_SEND_SIGNALED 390 | 391 | sendWr.wr_id = C.uint64_t(uintptr(unsafe.Pointer(&sendWr))) 392 | qp.workRequests[sendWr.wr_id] = workRequest{nil, &sendWr, nil, true} 393 | 394 | errno := C.ibv_post_send(qp.qp, &sendWr, &bad) 395 | return newError("ibv_post_send", errno) 396 | } 397 | 398 | func (qp *queuePair) Setup(destLid uint16, destQpn, destPsn uint32, messages []*MemoryRegion) error { 399 | for _, mr := range messages { 400 | if err := qp.PostReceive(mr); err != nil { 401 | return err 402 | } 403 | } 404 | if err := qp.ReadyToReceive(destLid, destQpn, destPsn); err != nil { 405 | return err 406 | } 407 | return qp.ReadyToSend() 408 | } 409 | 410 | func (qp *queuePair) pollCompletionQueue(wc *C.struct_ibv_wc) (int, error) { 411 | cq := qp.qp.send_cq 412 | nwc := C.ibv_poll_cq(cq, 1, wc) 413 | if nwc < 0 { 414 | return 0, newError("ibv_poll_cq", -1) 415 | } else if nwc == 0 { 416 | return 0, nil 417 | } 418 | return 1, nil 419 | } 420 | 421 | type WorkCompletion struct { 422 | mr *MemoryRegion 423 | wc C.struct_ibv_wc 424 | keepalive bool 425 | } 426 | 427 | func (wc *WorkCompletion) String() string { 428 | s := C.GoString(C.ibv_wc_status_str(wc.wc.status)) 429 | return fmt.Sprintf("WorkCompletion{%s,wc:%+v,keepalive:%v,%v}", s, wc.wc, wc.keepalive, wc.mr) 430 | } 431 | 432 | func (wc *WorkCompletion) MemoryRegion() *MemoryRegion { 433 | return wc.mr 434 | } 435 | 436 | func (wc *WorkCompletion) Success() bool { 437 | return wc.wc.status == C.IBV_WC_SUCCESS 438 | } 439 | 440 | func (wc *WorkCompletion) Send() bool { 
441 | return wc.wc.opcode == C.IBV_WC_SEND 442 | } 443 | 444 | func (wc *WorkCompletion) Write() bool { 445 | return wc.wc.opcode == C.IBV_WC_RDMA_WRITE 446 | } 447 | 448 | func (wc *WorkCompletion) Receive() bool { 449 | return wc.wc.opcode&C.IBV_WC_RECV != 0 450 | } 451 | 452 | func (wc *WorkCompletion) ImmData() uint32 { 453 | return uint32(wc.wc.imm_data) 454 | } 455 | 456 | func (wc *WorkCompletion) Keepalive() bool { 457 | return wc.keepalive 458 | } 459 | 460 | func (qp *queuePair) newWorkCompletion(wc *C.struct_ibv_wc) *WorkCompletion { 461 | // Inline sends (wr_id=0) must be signaled to prevent the send 462 | // queue from filling up, but their completions are only useful if 463 | // there is an error. 464 | if wc.wr_id == 0 { 465 | if wc.status != C.IBV_WC_SUCCESS { 466 | return &WorkCompletion{nil, *wc, false} 467 | } 468 | return nil 469 | } 470 | wr, ok := qp.workRequests[wc.wr_id] 471 | if !ok { 472 | panic("invalid work completion") 473 | } 474 | delete(qp.workRequests, wc.wr_id) 475 | return &WorkCompletion{wr.mr, *wc, wr.keepalive} 476 | } 477 | 478 | // Poll timeout is specified in milliseconds. If the returned work 479 | // completion and error are both nil, the caller should poll again. 
480 | func (qp *queuePair) Poll(nsec int64) (*WorkCompletion, error) { 481 | if qp.qp == nil { 482 | return nil, errQPAlreadyClosed 483 | } 484 | 485 | if qp.qp.send_cq != qp.qp.recv_cq { 486 | panic("send_cq != recv_cq not implemented") 487 | } 488 | 489 | var wc C.struct_ibv_wc 490 | if nwc, err := qp.pollCompletionQueue(&wc); err != nil { 491 | return nil, err 492 | } else if nwc > 0 { 493 | return qp.newWorkCompletion(&wc), nil 494 | } 495 | 496 | cq := qp.qp.send_cq 497 | if errno := C.ibv_req_notify_cq(cq, 0); errno != 0 { 498 | return nil, newError("ibv_req_notify_cq", errno) 499 | } 500 | 501 | if nwc, err := qp.pollCompletionQueue(&wc); err != nil { 502 | return nil, err 503 | } else if nwc > 0 { 504 | return qp.newWorkCompletion(&wc), nil 505 | } 506 | 507 | qp.pollfd[0].Revents = 0 508 | // nsec>>20 is a quick conversion to milliseconds 509 | n, err := Poll(qp.pollfd, nsec>>20) 510 | if err != nil { 511 | return nil, os.NewSyscallError("poll", err) 512 | } 513 | if n > 0 { 514 | var evcq *C.struct_ibv_cq 515 | var evctx unsafe.Pointer 516 | errno := C.ibv_get_cq_event(cq.channel, &evcq, &evctx) 517 | if errno != 0 { 518 | return nil, newError("ibv_get_cq_event", errno) 519 | } 520 | C.ibv_ack_cq_events(evcq, 1) 521 | // caller should poll again 522 | return nil, nil 523 | } 524 | 525 | return nil, syscall.EAGAIN 526 | } 527 | 528 | func (qp *queuePair) Query() *QPAttr { 529 | if qp.qp == nil { 530 | return nil 531 | } 532 | var attr QPAttr 533 | errno := C.ibv_query_qp(qp.qp, &attr.attr, (1<<28)-1, &attr.initAttr) 534 | if errno != 0 { 535 | return nil 536 | } 537 | return &attr 538 | } 539 | 540 | func (qp *queuePair) Sending() bool { 541 | for _, wr := range qp.workRequests { 542 | if wr.sendWr != nil { 543 | return true 544 | } 545 | } 546 | return false 547 | } 548 | --------------------------------------------------------------------------------