├── .gitignore ├── README.md ├── e1000.h └── e1000.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | e1000 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vfio-e1000 2 | Simple Intel 1GbE network driver implementation using VFIO. 3 | 4 | ## Binding vfio-pci driver 5 | - Before using VFIO, you need to bind the vfio-pci driver to the NIC like the following. 6 | 7 | ```sh 8 | % lspci -nn | grep -i Ether 9 | 01:00.0 Ethernet controller [0200]: Intel Corporation 82574L Gigabit Network Connection [8086:10d3] 10 | % sudo modprobe vfio-pci 11 | % echo 0000:01:00.0 | sudo tee -a /sys/bus/pci/devices/0000:01:00.0/driver/unbind 12 | % echo 8086 10d3 | sudo tee -a /sys/bus/pci/drivers/vfio-pci/new_id 13 | % sudo chown -R group:user /dev/vfio/66 14 | ``` 15 | 16 | - You can use [vfio-pci-bind](https://github.com/andre-richter/vfio-pci-bind) 17 | 18 | ## Compile 19 | - `gcc -o e1000 e1000.c` 20 | - Use INTx, dump received packets 21 | - `gcc -o e1000 e1000.c -DPOLL` 22 | - polling instead of interrupts 23 | - `gcc -o e1000 e1000.c -DMSIX -DTXINT -DECHO` 24 | - use MSIX, enable Tx interrupt, echo received packets 25 | - For more detail, see the source 26 | 27 | ## Usage 28 | - `./e1000 <PCI segment:bus:device.function>` 29 | - for example, `./e1000 0000:01:00.0` 30 | 31 | ## Note 32 | - If you get an out-of-memory error, you probably need to increase the memlock limit. 33 | - VFIO automatically locks the DMA memory region so that it cannot be moved. 34 | - Check the current memlock limit: `prlimit -l` 35 | - Temporarily increase the limit: `sudo prlimit --memlock=-1 ./e1000 0000:01:00.0` 36 | - To make the configuration permanent, edit `/etc/security/limits.conf` 37 | - I only tested this with 82574L on Intel CPU. 
38 | 39 | ## Requirements 40 | - IOMMU (VT-d for Intel) 41 | - VFIO supports no IOMMU mode, but this driver currently does not. 42 | 43 | ## Reference 44 | - https://www.kernel.org/doc/Documentation/vfio.txt 45 | - Intel® 82574 GbE Controller Family Datasheet, https://www.intel.ca/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf 46 | - MINIX e1000, https://github.com/Stichting-MINIX-Research-Foundation/minix/tree/master/minix/drivers/net/e1000 47 | - FreeBSD e1000, https://github.com/freebsd/freebsd/tree/master/sys/dev/e1000 48 | - Redox e1000, https://github.com/redox-os/drivers/blob/master/e1000d/ 49 | - snabb driver, https://github.com/snabbco/snabb/blob/master/src/apps/intel_mp/intel_mp.lua 50 | - old version, https://github.com/anttikantee/snabbswitch/blob/master/src/apps/intel/intel.lua 51 | 52 | ## License 53 | [MIT](https://opensource.org/licenses/MIT) 54 | -------------------------------------------------------------------------------- /e1000.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | typedef uint64_t u64; 6 | typedef uint32_t u32; 7 | typedef uint16_t u16; 8 | typedef uint8_t u8; 9 | 10 | #define CTRL_ 0x00 11 | #define CTRL_FD 1 12 | #define CTRL_LRST 1 << 3 // reserved 13 | #define CTRL_ASDE 1 << 5 14 | #define CTRL_SLU 1 << 6 15 | #define CTRL_ILOS 1 << 7 // reserved 16 | #define CTRL_RST 1 << 26 17 | #define CTRL_VME 1 << 30 18 | #define CTRL_PHY_RST 1 << 31 19 | 20 | #define STATUS 0x08 21 | 22 | // Flow Control Address 23 | #define FCAL 0x28 24 | #define FCAH 0x2C 25 | // Flow Control Type 26 | #define FCT 0x30 27 | // Flow Control Transmit Timer Value 28 | #define FCTTV 0x170 29 | 30 | // Interrupt Cause Read Regisetr 31 | #define ICR 0xC0 32 | 33 | // Interrupt Mask Set/Read Regisetr 34 | #define IMS 0xD0 35 | #define IMS_TXDW 1 36 | #define IMS_TXQE 1 << 1 37 | #define IMS_LSC 1 << 2 38 | #define IMS_RXSEQ 1 << 3 39 | #define IMS_RXDMT 1 << 4 40 | 
#define IMS_RXO 1 << 6 41 | #define IMS_RXT 1 << 7 42 | #define IMS_RXQ0 1 << 20 43 | #define IMS_RXQ1 1 << 21 44 | #define IMS_TXQ0 1 << 22 45 | #define IMS_TXQ1 1 << 23 46 | #define IMS_OTHER 1 << 24 47 | 48 | // Interrupt Mask Clear Register 49 | #define IMC 0xD8 50 | 51 | // Interrupt Vector Allocation Registers (for MSI-X) 52 | #define IVAR 0x000E4 53 | #define IVAR_RXQ0_VEC_SHIFT 0 54 | #define IVAR_EN_RXQ0 1 << 3 55 | #define IVAR_RXQ1_VEC_SHIFT 4 56 | #define IVAR_EN_RXQ1 1 << 7 57 | #define IVAR_TXQ0_VEC_SHIFT 8 58 | #define IVAR_EN_TXQ0 1 << 11 59 | #define IVAR_TXQ1_VEC_SHIFT 12 60 | #define IVAR_EN_TXQ1 1 << 15 61 | #define IVAR_OTHER_VEC_SHIFT 16 62 | #define IVAR_EN_OTHER 1 << 19 63 | 64 | // 3GIO Control Register 65 | #define GCR 0x05B00 66 | 67 | // Receive control 68 | #define RCTL 0x100 69 | #define RCTL_EN 1 << 1 70 | #define RCTL_UPE 1 << 3 71 | #define RCTL_MPE 1 << 4 72 | #define RCTL_LPE 1 << 5 73 | #define RCTL_LBM 1 << 6 | 1 << 7 74 | #define RCTL_BAM 1 << 15 75 | #define RCTL_BSIZE1 1 << 16 76 | #define RCTL_BSIZE2 1 << 17 77 | #define RCTL_BSEX 1 << 25 78 | #define RCTL_SECRC 1 << 26 79 | 80 | // Receive Descriptor Control 81 | #define RXDCTL 0x02828 82 | 83 | // Transmit Control 84 | #define TCTL 0x400 85 | #define TCTL_EN 1 << 1 86 | #define TCTL_PSP 1 << 3 87 | 88 | // Receive Descriptor Base Address 89 | #define RDBAL 0x2800 90 | #define RDBAH 0x2804 91 | // Receive Descriptor Length 92 | #define RDLEN 0x2808 93 | #define RDH 0x2810 94 | #define RDT 0x2818 95 | 96 | // Transmit Descriptor Base Address 97 | #define TDBAL 0x3800 98 | #define TDBAH 0x3804 99 | // Transmit Descriptor Length 100 | #define TDLEN 0x3808 101 | #define TDH 0x3810 102 | #define TDT 0x3818 103 | 104 | // Transmit Interrupt Delay Value 105 | #define TIDV 0x3820 106 | 107 | // Receive Address (MAC address) 108 | #define RAL0 0x5400 109 | #define RAH0 0x5404 110 | 111 | // some statistics register 112 | #define MPC 0x4010 // Missed Packets Count 113 | #define GPRC 
0x4074 // Good Packets Received Counts 114 | #define GPTC 0x4080 // Good Packets Transmitted Count 115 | #define GORCL 0x4088 // Good Octets Received Count 116 | #define GORCH 0x408C 117 | #define GOTCL 0x4088 // Good Octets Transmitted Count 118 | #define GOTCH 0x408C 119 | #define RXERRC 0x400C 120 | 121 | // legacy descriptor 122 | struct rdesc { 123 | u64 buffer; // buffer address 124 | u16 length; 125 | u16 checksum; 126 | union { 127 | u8 status; 128 | struct { 129 | u8 dd : 1; // descriptor done 130 | u8 eop : 1; // end of packet 131 | u8 ixsm : 1; // ignore checksum indication 132 | u8 vp : 1; // 802.1Q 133 | u8 udpcs : 1; // UDP checksum calculated 134 | u8 tcpcs : 1; // TCP checksum calculated 135 | u8 ipcs : 1; // IPv4 checksum calculated 136 | u8 pif : 1; // passed in-exact filter 137 | }; 138 | }; 139 | union { 140 | u8 error; 141 | struct { 142 | u8 ce : 1; // CRC error 143 | u8 se : 1; // symbol error 144 | u8 seq : 1; // sequence error 145 | u8 rcv : 2; // reserved 146 | u8 tcpe : 1; // TCP/UDP checksum error 147 | u8 ipe : 1; // IPv4 checksum error 148 | u8 rxe : 1; // RX data error 149 | }; 150 | }; 151 | u16 vlantag; // VLAN tag 152 | } __attribute__((packed)); 153 | 154 | struct tdesc { 155 | u64 buffer; // buffer address 156 | u16 length; 157 | u8 cso; // checksum offset 158 | union { 159 | u8 command; 160 | struct { 161 | u8 eop : 1; // end of packet 162 | u8 ifcs : 1; // insert FCS 163 | u8 ic : 1; // insert checksum 164 | u8 rs : 1; // report status 165 | u8 rsv : 1; // reserved 166 | u8 dext : 1; // extension 167 | u8 vle : 1; // VLAN packet enable 168 | u8 ide : 1; // interrupt delay enable 169 | }; 170 | }; 171 | union { 172 | u8 status; 173 | struct { 174 | u8 dd : 1; // descriptor done 175 | u8 ec : 1; // excess collisions 176 | u8 lc : 1; // late collisions 177 | u8 rsv2 : 5; // reserved 178 | }; 179 | }; 180 | u8 css; // checksum start 181 | u16 special; // special field 182 | } __attribute__((packed)); 183 | 184 | #define 
MAX_MSIX_VECTOR_NUM 5 185 | -------------------------------------------------------------------------------- /e1000.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | #include "e1000.h" 26 | 27 | #define ASSERT(expr, msg, ...) \ 28 | do { \ 29 | if (!(expr)) { \ 30 | fprintf(stderr, "[Error] %s:%3d %15s(): ", __FILE__, __LINE__, \ 31 | __func__); \ 32 | fprintf(stderr, msg "\n", ##__VA_ARGS__); \ 33 | exit(1); \ 34 | } \ 35 | } while (0) 36 | 37 | #ifndef NDEBUG 38 | #define DASSERT ASSERT 39 | #define debug(msg, ...) \ 40 | do { \ 41 | fprintf(stderr, "[Debug] %s:%d %s(): ", __FILE__, __LINE__, __func__); \ 42 | fprintf(stderr, msg "\n", ##__VA_ARGS__); \ 43 | } while (0) 44 | #else 45 | #define DASSERT(...) \ 46 | do { \ 47 | } while (0) 48 | #define debug(fmt, ...) 
\ 49 | do { \ 50 | } while (0) 51 | #endif 52 | 53 | struct device { 54 | struct rdesc* rx_ring; 55 | struct tdesc* tx_ring; 56 | void** rx_ring_buf_vaddr; 57 | void** tx_ring_buf_vaddr; 58 | int fd; // VFIO device fd 59 | int gfd; // VFIO group fd 60 | int cfd; // VFIO container fd 61 | int efd; // event fd (for INTx, MSI) 62 | int efds[MAX_MSIX_VECTOR_NUM]; // event fd (for MSI-x) 63 | int epfd; // epoll fd 64 | struct vfio_device_info device_info; 65 | struct vfio_group_status group_status; 66 | struct vfio_iommu_type1_info iommu_info; 67 | struct vfio_region_info regs[VFIO_PCI_NUM_REGIONS]; 68 | struct vfio_irq_info irqs[VFIO_PCI_NUM_IRQS]; 69 | void* mmio_addr; // mmio address (BAR0); 70 | }; 71 | 72 | #define BUFSIZE 4096 73 | #define NUM_OF_DESC 256 // 256 * 16 = 4096 74 | 75 | static char* region_name[VFIO_PCI_NUM_REGIONS] = { 76 | "BAR0", "BAR1", "BAR2", "BAR3", "BAR4", "BAR5", "ROM", "CONFIG", "VGA"}; 77 | static char* irq_name[VFIO_PCI_NUM_IRQS] = {"INTX", "MSI", "MSIX", "ERR", 78 | "REQ"}; 79 | 80 | // 82574L BARs (non-prefethcable, 32-bit addressing only) 81 | // BAR0 : Memory BAR 82 | // BAR1 : Flash BAR 83 | // BAR2 : IO BAR 84 | // BAR3 : MSI-X BAR 85 | // BAR4 : Reserved 86 | // BAR5 : Reserved 87 | 88 | #ifdef PHYSADDR_MAP 89 | // Convert virtual address to physical address 90 | // https://www.kernel.org/doc/Documentation/vm/pagemap.txt 91 | static uintptr_t virt_to_phys(void* virt) { 92 | long pagesize = sysconf(_SC_PAGESIZE); 93 | int fd = open("/proc/self/pagemap", O_RDONLY); 94 | ASSERT(fd != -1, "failed to open /proc/self/pagemap"); 95 | off_t ret = 96 | lseek(fd, (uintptr_t)virt / pagesize * sizeof(uintptr_t), SEEK_SET); 97 | ASSERT(ret != -1, "lseek error"); 98 | uintptr_t entry = 0; 99 | ssize_t rc = read(fd, &entry, sizeof(entry)); 100 | ASSERT(rc > 0, "read error"); 101 | ASSERT(entry != 0, 102 | "failed to get physical address for %p (perhaps forgot sudo?)", 103 | virt); 104 | close(fd); 105 | 106 | return (entry & 0x7fffffffffffffULL) 
* pagesize + 107 | ((uintptr_t)virt) % pagesize; 108 | } 109 | #endif 110 | 111 | static inline void write_u32(struct device* dev, int offset, uint32_t value) { 112 | __asm__ volatile("" : : : "memory"); 113 | *((volatile uint32_t*)(dev->mmio_addr + offset)) = value; 114 | } 115 | 116 | static inline uint32_t read_u32(struct device* dev, int offset) { 117 | __asm__ volatile("" : : : "memory"); 118 | return *((volatile uint32_t*)(dev->mmio_addr + offset)); 119 | } 120 | 121 | static inline void set_flags_u32(struct device* dev, int offset, 122 | uint32_t flags) { 123 | write_u32(dev, offset, read_u32(dev, offset) | flags); 124 | } 125 | 126 | static inline void clear_flags_u32(struct device* dev, int offset, 127 | uint32_t flags) { 128 | write_u32(dev, offset, read_u32(dev, offset) & ~flags); 129 | } 130 | 131 | static void open_vfio(struct device* dev, int segn, int busn, int devn, 132 | int funcn) { 133 | dev->device_info.argsz = sizeof(struct vfio_device_info); 134 | dev->group_status.argsz = sizeof(struct vfio_group_status); 135 | dev->iommu_info.argsz = sizeof(struct vfio_iommu_type1_info); 136 | for (int i = 0; i < VFIO_PCI_NUM_REGIONS; i++) { 137 | dev->regs[i].argsz = sizeof(struct vfio_region_info); 138 | } 139 | for (int i = 0; i < VFIO_PCI_NUM_IRQS; i++) { 140 | dev->irqs[i].argsz = sizeof(struct vfio_irq_info); 141 | } 142 | 143 | // find iommu group for the device 144 | // `readlink /sys/bus/pci/device//iommu_group` 145 | char path[128], iommu_group_path[128]; 146 | struct stat st; 147 | snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", 148 | segn, busn, devn, funcn); 149 | int ret = stat(path, &st); 150 | ASSERT(ret >= 0, "No such device: %s", path); 151 | strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); 152 | 153 | int len = readlink(path, iommu_group_path, sizeof(iommu_group_path)); 154 | ASSERT(len > 0, "No iommu_group for device"); 155 | 156 | iommu_group_path[len] = '\0'; 157 | char* group_name = 
basename(iommu_group_path); 158 | int groupid; 159 | ret = sscanf(group_name, "%d", &groupid); 160 | ASSERT(ret == 1, "unknown group"); 161 | 162 | // open vfio file 163 | dev->cfd = open("/dev/vfio/vfio", O_RDWR); 164 | ASSERT(dev->cfd >= 0, "failed to open /dev/vfio/vfio"); 165 | 166 | snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); 167 | dev->gfd = open(path, O_RDWR); 168 | ASSERT(dev->gfd >= 0, "failed to open %s", path); 169 | 170 | ret = ioctl(dev->gfd, VFIO_GROUP_GET_STATUS, &dev->group_status); 171 | ASSERT( 172 | dev->group_status.flags & VFIO_GROUP_FLAGS_VIABLE, 173 | "VFIO group is not visible (probably not all devices bound for vfio?)"); 174 | 175 | // set container 176 | ret = ioctl(dev->gfd, VFIO_GROUP_SET_CONTAINER, &dev->cfd); 177 | ASSERT(ret == 0, "failed to set container"); 178 | // set vfio type (type1 is for IOMMU like VT-d or AMD-Vi) 179 | ret = ioctl(dev->cfd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); 180 | ASSERT(ret == 0, "failed to set iommu type"); 181 | 182 | // get device descriptor 183 | snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", segn, busn, devn, 184 | funcn); 185 | dev->fd = ioctl(dev->gfd, VFIO_GROUP_GET_DEVICE_FD, path); 186 | ASSERT(dev->fd >= 0, "cannot get device fd"); 187 | } 188 | 189 | static void get_device_info(struct device* dev) { 190 | int i; 191 | ioctl(dev->fd, VFIO_DEVICE_GET_INFO, &dev->device_info); 192 | 193 | debug("num_regions: %d", dev->device_info.num_regions); 194 | debug("flags = CAPS, MMAP, WRITE, READ"); 195 | for (i = 0; i < dev->device_info.num_regions; i++) { 196 | dev->regs[i].index = i; 197 | ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, &dev->regs[i]); 198 | debug("region %d.flags = %d%d%d%d (%s)", i, 199 | !!(dev->regs[i].flags & VFIO_REGION_INFO_FLAG_CAPS), 200 | !!(dev->regs[i].flags & VFIO_REGION_INFO_FLAG_MMAP), 201 | !!(dev->regs[i].flags & VFIO_REGION_INFO_FLAG_WRITE), 202 | !!(dev->regs[i].flags & VFIO_REGION_INFO_FLAG_READ), 203 | region_name[i]); 204 | } 205 | 206 | debug("num_irqs: 
%d", dev->device_info.num_irqs); 207 | debug("flags = NORESIZE, AUTOMASKED, MASKABLE, NORESIZE"); 208 | for (i = 0; i < dev->device_info.num_irqs; i++) { 209 | dev->irqs[i].index = i; 210 | ioctl(dev->fd, VFIO_DEVICE_GET_IRQ_INFO, &dev->irqs[i]); 211 | 212 | debug("IRQ info %d (%s)", i, irq_name[i]); 213 | debug(" irq.flags = %d%d%d%d", 214 | !!(dev->irqs[i].flags & VFIO_IRQ_INFO_NORESIZE), 215 | !!(dev->irqs[i].flags & VFIO_IRQ_INFO_AUTOMASKED), 216 | !!(dev->irqs[i].flags & VFIO_IRQ_INFO_MASKABLE), 217 | !!(dev->irqs[i].flags & VFIO_IRQ_INFO_EVENTFD)); 218 | debug(" irq.index = %d", dev->irqs[i].index); 219 | debug(" irq.count = %d", dev->irqs[i].count); 220 | } 221 | } 222 | 223 | // Dump device PCI configuration space 224 | // Note that some fields may be virtualized by VFIO 225 | // (thus cannot mmap configuration space) 226 | // To check actual configuration space, `sudo lspci -xxxx -s ` 227 | static void dump_configuration_space(struct device* dev) { 228 | char buf[4096]; 229 | struct vfio_region_info* cs_info = &dev->regs[VFIO_PCI_CONFIG_REGION_INDEX]; 230 | int ret = pread(dev->fd, buf, cs_info->size > 4096 ? 
4096 : cs_info->size, 231 | cs_info->offset); 232 | ASSERT(ret >= 0, "pread error"); 233 | 234 | int len; 235 | for (len = ret - 1; len >= 0; len--) { 236 | if (buf[len] != 0) 237 | break; 238 | } 239 | len = (len + 16) - (len + 16) % 16; 240 | 241 | for (int i = 0; i < len;) { 242 | printf("%3X: ", i); 243 | for (int j = 0; j < 16 && i < len; i++, j++) { 244 | printf("%02X ", (u8)buf[i]); 245 | } 246 | printf("\n"); 247 | } 248 | } 249 | 250 | void init_vfio(struct device* dev, int segn, int busn, int devn, int func) { 251 | open_vfio(dev, segn, busn, devn, func); 252 | get_device_info(dev); 253 | #ifndef NDEBUG 254 | dump_configuration_space(dev); 255 | #endif 256 | } 257 | 258 | // Enable DMA 259 | void enable_bus_master(struct device* dev) { 260 | struct vfio_region_info* cs_info = &dev->regs[VFIO_PCI_CONFIG_REGION_INDEX]; 261 | char buf[2]; 262 | pread(dev->fd, buf, 2, cs_info->offset + 4); 263 | *(u16*)(buf) |= 1 << 2; 264 | pwrite(dev->fd, buf, 2, cs_info->offset + 4); 265 | debug("PCI configuration space command reg = %04X\n", *(u16*)buf); 266 | } 267 | 268 | // Convert virtual address to IOVA 269 | static u64 get_iova(u64 virt_addr, ssize_t size) { 270 | static u64 _iova = 0; 271 | #if defined(IDENTITY_MAP) 272 | // Use virtual address as IOVA 273 | // Note that some architecture only support 3-level page table (39-bit) and 274 | // cannot use virtual address as IOVA 275 | return virt_addr; 276 | #elif defined(PHYSADDR_MAP) 277 | // Use physical address as IOVA 278 | return (u64)virt_to_phys(virt_addr); 279 | #else 280 | // Assign IOVA from 0 281 | u64 ret = _iova; 282 | _iova += size; 283 | return ret; 284 | #endif 285 | } 286 | 287 | // Allocate rx_ring and DMA buffer 288 | // XXX: should use hugetlb 289 | static u64 init_rx_buf(struct device* dev) { 290 | struct vfio_iommu_type1_dma_map dma_map = { 291 | .argsz = sizeof(dma_map), 292 | .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE}; 293 | 294 | ssize_t size = NUM_OF_DESC * sizeof(struct 
rdesc); 295 | dev->rx_ring = mmap(NULL, size, PROT_READ | PROT_WRITE, 296 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 297 | ASSERT(dev->rx_ring != MAP_FAILED, "failed to mmap rx ring"); 298 | dev->rx_ring_buf_vaddr = 299 | mmap(NULL, sizeof(void*) * NUM_OF_DESC, PROT_READ | PROT_WRITE, 300 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 301 | ASSERT(dev->rx_ring_buf_vaddr != MAP_FAILED, 302 | "failed to mmap rx vaddr buffer"); 303 | 304 | // setup iommu for rx_ring 305 | dma_map.size = NUM_OF_DESC * sizeof(struct rdesc); 306 | dma_map.vaddr = (u64)dev->rx_ring; 307 | dma_map.iova = get_iova((u64)dev->rx_ring, size); 308 | u64 rx_ring_iova = dma_map.iova; 309 | int ret = ioctl(dev->cfd, VFIO_IOMMU_MAP_DMA, &dma_map); 310 | ASSERT(ret == 0, "failed to map rx_ring"); 311 | 312 | // allocate buffer 313 | for (int i = 0; i < NUM_OF_DESC; i++) { 314 | void* buffer = mmap(NULL, BUFSIZE, PROT_READ | PROT_WRITE, 315 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 316 | ASSERT(buffer != MAP_FAILED, "failed to mmap rx buffer"); 317 | 318 | // setup iommu for buffer 319 | dma_map.size = BUFSIZE; 320 | dma_map.vaddr = (u64)buffer; 321 | dma_map.iova = get_iova((u64)buffer, BUFSIZE); 322 | dev->rx_ring_buf_vaddr[i] = buffer; 323 | ret = ioctl(dev->cfd, VFIO_IOMMU_MAP_DMA, &dma_map); 324 | ASSERT(ret == 0, "failed to map rx buffer %d (%s)\n", i, 325 | strerror(errno)); 326 | dev->rx_ring[i].buffer = dma_map.iova; 327 | } 328 | return rx_ring_iova; 329 | } 330 | 331 | // allocate tx_ring 332 | // XXX: should use hugetlb 333 | static u64 init_tx_buf(struct device* dev) { 334 | struct vfio_iommu_type1_dma_map dma_map = { 335 | .argsz = sizeof(dma_map), 336 | .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE}; 337 | 338 | ssize_t size = NUM_OF_DESC * sizeof(struct tdesc); 339 | dev->tx_ring = mmap(NULL, size, PROT_READ | PROT_WRITE, 340 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 341 | ASSERT(dev->tx_ring != MAP_FAILED, "failed to mmap tx ring"); 342 | dev->tx_ring_buf_vaddr = 343 | mmap(NULL, 
sizeof(void*) * NUM_OF_DESC, PROT_READ | PROT_WRITE, 344 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 345 | ASSERT(dev->tx_ring_buf_vaddr != MAP_FAILED, 346 | "failed to mmap tx vaddr buffer"); 347 | 348 | // setup iommu for tx_ring 349 | dma_map.iova = get_iova((u64)dev->tx_ring, size); 350 | dma_map.size = size; 351 | dma_map.vaddr = (u64)dev->tx_ring; 352 | u64 tx_ring_iova = dma_map.iova; 353 | int ret = ioctl(dev->cfd, VFIO_IOMMU_MAP_DMA, &dma_map); 354 | ASSERT(ret == 0, "failed to map dev->tx_ring"); 355 | 356 | // allocate buffer 357 | for (int i = 0; i < NUM_OF_DESC; i++) { 358 | void* buffer = mmap(NULL, BUFSIZE, PROT_READ | PROT_WRITE, 359 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 360 | ASSERT(buffer != MAP_FAILED, "failed to mmap tx buffer"); 361 | 362 | // setup iommu for buffer 363 | dma_map.iova = get_iova((u64)buffer, BUFSIZE); 364 | dma_map.size = BUFSIZE; 365 | dma_map.vaddr = (u64)buffer; 366 | dev->tx_ring_buf_vaddr[i] = buffer; 367 | ret = ioctl(dev->cfd, VFIO_IOMMU_MAP_DMA, &dma_map); 368 | ASSERT(ret == 0, "failed to map tx buffer %d", i); 369 | dev->tx_ring[i].buffer = dma_map.iova; 370 | } 371 | return tx_ring_iova; 372 | } 373 | 374 | // unmask INTx 375 | static void unmask_intx(struct device* dev) { 376 | char irq_set_buf[sizeof(struct vfio_irq_set)]; 377 | struct vfio_irq_set* irq_set = (struct vfio_irq_set*)irq_set_buf; 378 | irq_set->argsz = sizeof(struct vfio_irq_set); 379 | irq_set->count = 1; 380 | irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; 381 | irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; 382 | irq_set->start = 0; 383 | int ret = ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set); 384 | ASSERT(ret == 0, "faield to unmask INTx interrupt"); 385 | } 386 | 387 | // Enable INTx interrupt 388 | static void enable_intx(struct device* dev) { 389 | debug("Use INTx interrupt"); 390 | struct vfio_irq_set* irq_set; 391 | char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)]; 392 | irq_set = (struct vfio_irq_set*)irq_set_buf; 
393 | irq_set->argsz = sizeof(irq_set_buf); 394 | irq_set->count = 1; 395 | irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; 396 | irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; 397 | irq_set->start = 0; 398 | dev->efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); 399 | debug("efd = %d\n", dev->efd); 400 | ASSERT(dev->efd >= 0, "efd init failed"); 401 | *(int*)&irq_set->data = dev->efd; 402 | int ret = ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set); 403 | ASSERT(ret == 0, "faield to enable INTx interrupt"); 404 | 405 | unmask_intx(dev); 406 | } 407 | 408 | // Enable MSI interrupt 409 | // 82574L only has one MSI interrupt vector 410 | // so basically same as INTx from driver's point of view 411 | static void enable_msi(struct device* dev) { 412 | debug("Use MSI interrupt"); 413 | struct vfio_irq_set* irq_set; 414 | char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)]; 415 | irq_set = (struct vfio_irq_set*)irq_set_buf; 416 | irq_set->argsz = sizeof(irq_set_buf); 417 | irq_set->count = 1; 418 | irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; 419 | irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; 420 | irq_set->start = 0; 421 | dev->efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); 422 | debug("efd = %d\n", dev->efd); 423 | ASSERT(dev->efd >= 0, "efd init failed"); 424 | *(int*)&irq_set->data = dev->efd; 425 | int ret = ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set); 426 | ASSERT(ret == 0, "faield to enable MSI interrupt"); 427 | } 428 | 429 | // Enable MSI-X Interrupt 430 | // 82574L has five MSI-X interrupt vectors 431 | static void enable_msix(struct device* dev) { 432 | debug("Use MSI-X interrupt"); 433 | struct vfio_irq_set* irq_set; 434 | char irq_set_buf[sizeof(struct vfio_irq_set) + 435 | sizeof(int) * MAX_MSIX_VECTOR_NUM]; 436 | irq_set = (struct vfio_irq_set*)irq_set_buf; 437 | irq_set->argsz = sizeof(irq_set_buf); 438 | irq_set->count = MAX_MSIX_VECTOR_NUM; 439 | irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 
VFIO_IRQ_SET_ACTION_TRIGGER; 440 | irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; 441 | irq_set->start = 0; 442 | for (int i = 0; i < MAX_MSIX_VECTOR_NUM; i++) { 443 | dev->efds[i] = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); 444 | ASSERT(dev->efds[i] >= 0, "efd init failed"); 445 | } 446 | memcpy((int*)&irq_set->data, dev->efds, sizeof(dev->efds)); 447 | int ret = ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set); 448 | ASSERT(ret == 0, "faield to enable MSI-X interrupt"); 449 | 450 | // Setup MSI-X interrupt vector 451 | // RxQ0 => 0 452 | // RxQ1 => 1 453 | // TxQ0 => 2 454 | // TxQ1 => 3 455 | // Other => 4 456 | write_u32(dev, IVAR, 457 | IVAR_EN_RXQ0 | IVAR_EN_RXQ1 | IVAR_EN_TXQ0 | IVAR_EN_TXQ1 | 458 | IVAR_EN_OTHER | 0 << IVAR_RXQ0_VEC_SHIFT | 459 | 1 << IVAR_RXQ1_VEC_SHIFT | 2 << IVAR_TXQ0_VEC_SHIFT | 460 | 3 << IVAR_TXQ1_VEC_SHIFT | 4 << IVAR_OTHER_VEC_SHIFT); 461 | } 462 | 463 | // Disable all interrupts 464 | static void disable_interrupt(struct device* dev) { 465 | write_u32(dev, IMC, 0xffffffff); 466 | } 467 | 468 | // Reset device 469 | static void reset(struct device* dev) { 470 | set_flags_u32(dev, CTRL_, CTRL_RST); 471 | usleep(500 * 1000); 472 | } 473 | 474 | // Set general configuration 475 | static void global_configuration(struct device* dev) { 476 | // CTRL.FD = 1 477 | set_flags_u32(dev, CTRL_, CTRL_FD); 478 | // GCR[22] = 1 479 | write_u32(dev, GCR, read_u32(dev, GCR) | 1 << 22); 480 | 481 | // no flow control 482 | write_u32(dev, FCAH, 0); 483 | write_u32(dev, FCAL, 0); 484 | write_u32(dev, FCT, 0); 485 | write_u32(dev, FCTTV, 0); 486 | } 487 | 488 | // Initialize some statistics registers 489 | // NOTE: All statistics registers reset when read 490 | static void init_stat_regs(struct device* dev) { 491 | read_u32(dev, MPC); 492 | read_u32(dev, GPRC); 493 | read_u32(dev, GPTC); 494 | read_u32(dev, GORCL); 495 | read_u32(dev, GORCH); 496 | read_u32(dev, GOTCL); 497 | read_u32(dev, GOTCH); 498 | } 499 | 500 | // Set link up 501 | static void linkup(struct 
device* dev) { 502 | set_flags_u32(dev, CTRL_, CTRL_SLU); 503 | int retry = 50; 504 | printf("waiting linkup."); 505 | while (!(read_u32(dev, STATUS) & 0x2) && retry--) { 506 | printf("."); 507 | fflush(stdout); 508 | usleep(500 * 1000); 509 | } 510 | printf("\n"); 511 | ASSERT(read_u32(dev, STATUS) & 0x2, "failed to link up"); 512 | } 513 | 514 | // Receive Initialization 515 | static void init_receive(struct device* dev) { 516 | u64 rx_ring_iova = init_rx_buf(dev); 517 | debug("rx ring iova = %08lX", rx_ring_iova); 518 | write_u32(dev, RDLEN, NUM_OF_DESC * sizeof(struct rdesc)); 519 | write_u32(dev, RDBAL, (u32)(rx_ring_iova & 0xFFFFFFFFull)); 520 | write_u32(dev, RDBAH, (u32)(rx_ring_iova >> 32)); 521 | write_u32(dev, RDH, 0); 522 | write_u32(dev, RDT, NUM_OF_DESC - 1); 523 | 524 | // Enable receive 525 | write_u32(dev, RCTL, 526 | RCTL_EN | /* Enable */ 527 | RCTL_UPE | /* Unicast Promiscuous Enable*/ 528 | RCTL_MPE | /* Multicast Promiscuous Enable */ 529 | RCTL_BSIZE1 | /* BSIZE == 11b => 4096 bytes (if BSEX = 1) */ 530 | RCTL_BSIZE2 | /* */ 531 | RCTL_LPE | /* Long Packet Enable */ 532 | RCTL_BAM | /* Broadcast Accept Mode */ 533 | RCTL_BSEX | /* Buffer Size Extension */ 534 | RCTL_SECRC /* Strip Ethernet CRC from incoming packet */ 535 | ); 536 | } 537 | 538 | // Transmit Initialization 539 | static void init_transmit(struct device* dev) { 540 | u64 tx_ring_iova = init_tx_buf(dev); 541 | debug("tx ring iova = %08lX", tx_ring_iova); 542 | write_u32(dev, TDBAH, (u32)((u64)tx_ring_iova >> 32)); 543 | write_u32(dev, TDBAL, (u32)((u64)tx_ring_iova & 0xFFFFFFFFull)); 544 | write_u32(dev, TDLEN, NUM_OF_DESC * sizeof(struct tdesc)); 545 | write_u32(dev, TDH, 0); 546 | write_u32(dev, TDT, 0); 547 | 548 | // Enable transmit 549 | write_u32(dev, TCTL, 550 | TCTL_EN | /* Enable */ 551 | TCTL_PSP /* Pad short packets */ 552 | ); 553 | } 554 | 555 | // enable interrupt 556 | // 1. setup eventfd and enable device interrupts 557 | // 2. 
set IMS register appropriately 558 | static void enable_interrupt(struct device* dev) { 559 | #if defined(MSI) 560 | enable_msi(dev); 561 | #elif defined(MSIX) 562 | enable_msix(dev); 563 | #else 564 | enable_intx(dev); 565 | #endif 566 | 567 | write_u32(dev, IMS, 568 | IMS_LSC | /* Link State Change */ 569 | IMS_RXT | /* Receiver Timer Interrupt */ 570 | IMS_RXDMT /* Receiver descriptor minimum threshold hit */ 571 | ); 572 | 573 | #ifdef TXINT 574 | set_flags_u32(dev, IMS, IMS_TXDW); /* Transmit Descriptor Written Back */ 575 | // add some interrupt delay 576 | // (otherwise ICR_TXDW will be cleared before an interrupt arrives) 577 | write_u32(dev, TIDV, 1); 578 | #endif 579 | #ifdef MSIX 580 | set_flags_u32(dev, IMS, IMS_RXQ0 | IMS_TXQ0 | IMS_OTHER); 581 | #endif 582 | debug("IMS: %08X", read_u32(dev, IMS)); 583 | } 584 | 585 | void init_device(struct device* dev) { 586 | // mmap BAR0 587 | struct vfio_region_info* bar0_info = &dev->regs[VFIO_PCI_BAR0_REGION_INDEX]; 588 | dev->mmio_addr = mmap(NULL, bar0_info->size, PROT_READ | PROT_WRITE, 589 | MAP_SHARED, dev->fd, bar0_info->offset); 590 | ASSERT(dev->mmio_addr != MAP_FAILED, "mmap failed"); 591 | 592 | // c.f. manual 4.6 593 | 594 | /* 1. Disable Interrupts */ 595 | disable_interrupt(dev); 596 | 597 | /* 2. Global reset & general configuration */ 598 | reset(dev); 599 | disable_interrupt(dev); 600 | global_configuration(dev); 601 | 602 | /* 3. Setup the PHY and the link */ 603 | linkup(dev); 604 | 605 | /* 4. Initialize statistical counters */ 606 | init_stat_regs(dev); 607 | 608 | /* 5. Initialize Receive */ 609 | init_receive(dev); 610 | 611 | /* 6. Initialize Transmit */ 612 | init_transmit(dev); 613 | 614 | #ifndef POLL 615 | /* 7. 
Enable Interrupts */ 616 | enable_interrupt(dev); 617 | #endif 618 | 619 | /* dump some information */ 620 | u32 rah0 = read_u32(dev, RAH0); 621 | u32 ral0 = read_u32(dev, RAL0); 622 | debug("MAC: %02X:%02X:%02X:%02X:%02X:%02X", (ral0)&0xff, (ral0 >> 8) & 0xff, 623 | (ral0 >> 16) & 0xff, (ral0 >> 24) & 0xff, (rah0)&0xff, 624 | (rah0 >> 8) & 0xff); 625 | debug("CTRL: %08X", read_u32(dev, CTRL_)); 626 | u32 status = read_u32(dev, STATUS); 627 | debug("STATUS: %08X", status); 628 | debug(" FD = %d", status & 0x1); 629 | debug(" LU = %d", (status >> 1) & 0x1); 630 | debug(" SPEED = %d", (status >> 6) & 0x3); 631 | debug("RCTL: %08X", read_u32(dev, RCTL)); 632 | debug("RDBAL: %08X, RDBAH=%08X", read_u32(dev, RDBAL), 633 | read_u32(dev, RDBAH)); 634 | debug("RDLEN: %08X", read_u32(dev, RDLEN)); 635 | debug("TCTL: %08X", read_u32(dev, TCTL)); 636 | debug("TDBAL: %08X, TDBAH=%08X", read_u32(dev, TDBAL), 637 | read_u32(dev, TDBAH)); 638 | debug("TDLEN: %08X", read_u32(dev, TDLEN)); 639 | } 640 | 641 | void dump_pkt(void* addr) { 642 | struct ethhdr* eth = addr; 643 | 644 | printf("src=%02X:%02X:%02X:%02X:%02X:%02X\n", eth->h_source[5], 645 | eth->h_source[4], eth->h_source[3], eth->h_source[2], 646 | eth->h_source[1], eth->h_source[0]); 647 | 648 | printf("dst=%02X:%02X:%02X:%02X:%02X:%02X\n", eth->h_dest[5], 649 | eth->h_dest[4], eth->h_dest[3], eth->h_dest[2], eth->h_dest[1], 650 | eth->h_dest[0]); 651 | printf("proto=%04X\n", ntohs(eth->h_proto)); 652 | } 653 | 654 | // transmit packet 655 | void tx(struct device* dev, void* buffer, ssize_t len) { 656 | ASSERT(len <= BUFSIZE, "too much large packet: %lu", len); 657 | u32 tdt = read_u32(dev, TDT); 658 | u32 tdh = read_u32(dev, TDH); 659 | debug("tdh=%u, tdt=%u", tdh, tdt); 660 | if (tdh != ((tdt + 1) % NUM_OF_DESC)) { 661 | memcpy(dev->tx_ring_buf_vaddr[tdt], buffer, len); 662 | dev->tx_ring[tdt].length = len; 663 | dev->tx_ring[tdt].ifcs = 1; // insert FCS 664 | dev->tx_ring[tdt].eop = 1; // end of packets 665 | #ifdef 
TXINT 666 | dev->tx_ring[tdt].rs = 1; // report status 667 | #ifndef MSIX 668 | // In MSI-X, IDE bit should not be set (manual 7.2.8) 669 | dev->tx_ring[tdt].ide = 1; // interrupt delay enable 670 | #endif 671 | #endif 672 | write_u32(dev, TDT, (tdt + 1) % NUM_OF_DESC); 673 | } 674 | } 675 | 676 | static void set_source_mac(struct device* dev, struct ethhdr* eth) { 677 | u32 rah0 = read_u32(dev, RAH0); 678 | u32 ral0 = read_u32(dev, RAL0); 679 | eth->h_source[5] = (rah0 >> 8) & 0xff; 680 | eth->h_source[4] = (rah0)&0xff; 681 | eth->h_source[3] = (ral0 >> 24) & 0xff; 682 | eth->h_source[2] = (ral0 >> 16) & 0xff; 683 | eth->h_source[1] = (ral0 >> 8) & 0xff; 684 | eth->h_source[0] = (ral0)&0xff; 685 | } 686 | 687 | void set_ether(struct device* dev, void* addr) { 688 | struct ethhdr* eth = addr; 689 | memcpy(eth->h_source, eth->h_dest, 6); 690 | set_source_mac(dev, eth); 691 | } 692 | 693 | // 1. dump receive packtes 694 | // 2. echo pakcet if needed 695 | // 3. clear desc and advance RDH 696 | u32 rx(struct device* dev, u32 idx) { 697 | dump_pkt(dev->rx_ring_buf_vaddr[idx]); 698 | #ifdef ECHO 699 | // note that we can do this through zero-copy 700 | set_ether(dev, dev->rx_ring_buf_vaddr[idx]); 701 | tx(dev, dev->rx_ring_buf_vaddr[idx], dev->rx_ring[idx].length); 702 | #endif 703 | // clear desc 704 | dev->rx_ring[idx].dd = 0; 705 | u32 head = read_u32(dev, RDH); 706 | if (head != idx) { 707 | write_u32(dev, RDT, idx); 708 | } 709 | return (idx + 1) % NUM_OF_DESC; 710 | } 711 | 712 | // Poll receive descriptor 713 | static void poll(struct device* dev) { 714 | printf("start polling\n"); 715 | u32 rx_idx = 0; 716 | int cnt = 0; 717 | while (1) { 718 | if (dev->rx_ring[rx_idx].dd) { // descriptor done 719 | rx_idx = rx(dev, rx_idx); 720 | cnt += 1; 721 | } 722 | } 723 | } 724 | 725 | // Wait interrupt using epoll_wait and handle it when it arrives 726 | // Note that we can simply use `read(dev->efd, &u, sizeof(u))` for INTx and MSI 727 | // (eventfd should be created 
without `EFD_NONBLOCK`) 728 | static void handle_intr(struct device* dev) { 729 | // Create epoll fd 730 | dev->epfd = epoll_create1(EPOLL_CLOEXEC); 731 | ASSERT(dev->epfd >= 0, "failed to create epoll fd"); 732 | 733 | // Add eventfd to epoll 734 | #ifndef MSIX // INTx, MSI 735 | struct epoll_event ev = {.events = EPOLLIN | EPOLLPRI, .data.fd = dev->efd}; 736 | int ret = epoll_ctl(dev->epfd, EPOLL_CTL_ADD, dev->efd, &ev); 737 | ASSERT(ret == 0, "cannot add fd to epoll"); 738 | #else 739 | for (int i = 0; i < MAX_MSIX_VECTOR_NUM; i++) { 740 | struct epoll_event ev = {.events = EPOLLIN | EPOLLPRI, 741 | .data.fd = dev->efds[i]}; 742 | int ret = epoll_ctl(dev->epfd, EPOLL_CTL_ADD, dev->efds[i], &ev); 743 | ASSERT(ret == 0, "cannot add fd to epoll"); 744 | } 745 | #endif 746 | 747 | struct epoll_event evs; 748 | u32 rx_idx = 0; 749 | printf("waiting interrupts...\n"); 750 | for (;;) { 751 | // blocking wait 752 | int rc = epoll_wait(dev->epfd, &evs, 1, -1); 753 | ASSERT(rc > 0, "epoll error"); 754 | debug("epoll return: %d", rc); 755 | u64 u; 756 | debug("evs fd = %d", evs.data.fd); 757 | ssize_t s = read(evs.data.fd, &u, sizeof(u)); 758 | ASSERT(s == sizeof(u), "efd read failed"); 759 | 760 | u32 icr = read_u32(dev, ICR); 761 | debug("ICR = %08x", icr); 762 | 763 | #if !defined(MSIX) 764 | if (icr & (IMS_RXDMT | IMS_RXT)) { 765 | ASSERT(dev->rx_ring[rx_idx].dd == 1, "dd != 1"); 766 | debug("receive interrupt"); 767 | rx_idx = rx(dev, rx_idx); 768 | } 769 | if (icr & IMS_LSC) { 770 | debug("link state change"); 771 | } 772 | if (icr & (IMS_TXDW)) { 773 | debug("transmit interrupt"); 774 | } 775 | #else 776 | if (evs.data.fd == dev->efds[0]) { // RX0 777 | ASSERT(dev->rx_ring[rx_idx].dd == 1, "dd != 1"); 778 | debug("RX0 interrupt"); 779 | rx_idx = rx(dev, rx_idx); 780 | } else if (evs.data.fd == dev->efds[1]) { // RX1 781 | debug("RX1 interrupt"); 782 | } else if (evs.data.fd == dev->efds[2]) { // TX0 783 | debug("TX0 interrupt"); 784 | } else if (evs.data.fd == 
dev->efds[3]) { // TX1 785 | debug("TX1 interrupt"); 786 | } else { 787 | debug("Other interrupt"); 788 | } 789 | #endif 790 | 791 | // clear interrupt 792 | write_u32(dev, ICR, read_u32(dev, ICR) | 0xFFFFFFFF); 793 | 794 | #if !defined(MSI) && !defined(MSIX) 795 | // INTx is automatically masked by the VFIO INTx handler 796 | unmask_intx(dev); 797 | #endif 798 | } 799 | } 800 | 801 | // Construct dummy packet 802 | static ssize_t create_dummy_packet(struct device* dev, char* buf) { 803 | struct ethhdr* eth = (struct ethhdr*)buf; 804 | set_source_mac(dev, eth); 805 | eth->h_dest[0] = 0xFF; 806 | eth->h_dest[1] = 0xFF; 807 | eth->h_dest[2] = 0xFF; 808 | eth->h_dest[3] = 0xFF; 809 | eth->h_dest[4] = 0xFF; 810 | eth->h_dest[5] = 0xFF; 811 | eth->h_proto = htons(0x0800); // IPv4 812 | struct iphdr* ip = (struct iphdr*)((char*)eth + sizeof(struct ethhdr)); 813 | ip->ihl = 5; 814 | ip->version = 4; 815 | ip->tot_len = htons(sizeof(struct ip) + sizeof(struct udphdr) + 1); 816 | ip->protocol = 17; // UDP 817 | ip->ttl = 255; 818 | ip->saddr = htonl(0xc0a8140a); // 192.168.20.10 819 | ip->daddr = htonl(0xc0a81414); // 192.168.20.20 820 | // XXX: Should calculate checksum 821 | ip->check = 0; 822 | struct udphdr* udp = (struct udphdr*)((char*)ip + sizeof(struct ip)); 823 | udp->uh_sport = htons(10000); 824 | udp->uh_dport = htons(20000); 825 | udp->uh_ulen = htons(sizeof(struct udphdr) + 1); 826 | udp->uh_sum = 0; 827 | buf[sizeof(struct ethhdr) + sizeof(struct ip) + sizeof(struct udphdr)] = 828 | 'a'; 829 | 830 | return sizeof(struct ethhdr) + sizeof(struct ip) + sizeof(struct udphdr) + 831 | 1; 832 | } 833 | 834 | // Send a dummy packet when pressing a key 835 | static void pkt_send(struct device* dev) { 836 | char buf[4096]; 837 | memset(buf, 0, 4096); 838 | ssize_t len = create_dummy_packet(dev, buf); 839 | while (1) { 840 | getchar(); 841 | tx(dev, buf, len); 842 | debug("send pkt"); 843 | } 844 | } 845 | 846 | // Compile flag (() is default) 847 | // - (mapping from 
zero), IDENTITY_MAP, PHYSADDR_MAP 848 | // - (INTx), MSI, MSIX 849 | // - (interrupt), POLL, PKTSEND 850 | // - (no echo), ECHO 851 | // - (no tx interrupts), TXINT 852 | // 853 | // note that interrupts option is only valid when do `handle_intr()` 854 | 855 | int main(int argc, char* argv[]) { 856 | int segn, busn, devn, funcn, i; 857 | struct device dev; 858 | 859 | if (argc < 2 || sscanf(argv[1], "%04x:%02x:%02x.%d", &segn, &busn, &devn, 860 | &funcn) != 4) { 861 | printf("Usage: %s ssss:bb:dd.f\n", argv[1]); 862 | return -1; 863 | } 864 | 865 | init_vfio(&dev, segn, busn, devn, funcn); 866 | enable_bus_master(&dev); 867 | init_device(&dev); 868 | 869 | #if defined(POLL) 870 | poll(&dev); 871 | #elif defined(PKTSEND) 872 | pkt_send(&dev); 873 | #else 874 | handle_intr(&dev); 875 | #endif 876 | 877 | // XXX: It's better to do cleanup 878 | 879 | return 0; 880 | } 881 | --------------------------------------------------------------------------------