├── .gitignore ├── Gopkg.lock ├── Gopkg.toml ├── LICENSE ├── Makefile ├── README.md ├── device.go ├── encapsulates.png ├── etcd.go ├── iptables.go ├── main.go ├── subnet.go ├── vxlan.sh └── vxlan_overview.png /.gitignore: -------------------------------------------------------------------------------- 1 | vendor -------------------------------------------------------------------------------- /Gopkg.lock: -------------------------------------------------------------------------------- 1 | # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 2 | 3 | 4 | [[projects]] 5 | name = "github.com/Sirupsen/logrus" 6 | packages = ["."] 7 | revision = "d682213848ed68c0a260ca37d6dd5ace8423f5ba" 8 | version = "v1.0.4" 9 | 10 | [[projects]] 11 | name = "github.com/coreos/etcd" 12 | packages = [ 13 | "client", 14 | "pkg/pathutil", 15 | "pkg/srv", 16 | "pkg/types", 17 | "version" 18 | ] 19 | revision = "28f3f26c0e303392556035b694f75768d449d33d" 20 | version = "v3.3.1" 21 | 22 | [[projects]] 23 | name = "github.com/coreos/go-iptables" 24 | packages = ["iptables"] 25 | revision = "259c8e6a4275d497442c721fa52204d7a58bde8b" 26 | version = "v0.2.0" 27 | 28 | [[projects]] 29 | name = "github.com/coreos/go-semver" 30 | packages = ["semver"] 31 | revision = "8ab6407b697782a06568d4b7f1db25550ec2e4c6" 32 | version = "v0.2.0" 33 | 34 | [[projects]] 35 | name = "github.com/ugorji/go" 36 | packages = ["codec"] 37 | revision = "9831f2c3ac1068a78f50999a30db84270f647af6" 38 | version = "v1.1" 39 | 40 | [[projects]] 41 | branch = "master" 42 | name = "github.com/vishvananda/netlink" 43 | packages = [ 44 | ".", 45 | "nl" 46 | ] 47 | revision = "25d2c79295b361cac34eb41cfae90d5c69078b2a" 48 | 49 | [[projects]] 50 | branch = "master" 51 | name = "github.com/vishvananda/netns" 52 | packages = ["."] 53 | revision = "be1fbeda19366dea804f00efff2dd73a1642fdcc" 54 | 55 | [[projects]] 56 | branch = "master" 57 | name = "golang.org/x/crypto" 58 | packages = ["ssh/terminal"] 59 | revision = "650f4a345ab4e5b245a3034b110ebc7299e68186" 60 | 61 | [[projects]] 62 | branch = "master" 63 | name = "golang.org/x/sys" 64 | packages = [ 65 | "unix", 66 | "windows" 67 | ] 68 | revision = "37707fdb30a5b38865cfb95e5aab41707daec7fd" 69 | 70 | [solve-meta] 71 | analyzer-name = "dep" 72 | analyzer-version = 1 73 | inputs-digest = "f70a513a74c98c294713c58abcd03dae4ae7a54168cc2f4c99893fecdc55cc5e" 74 | solver-name = "gps-cdcl" 75 | solver-version = 1 76 | -------------------------------------------------------------------------------- /Gopkg.toml: -------------------------------------------------------------------------------- 1 | # Gopkg.toml example 2 | # 3 | # Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md 4 | # for detailed Gopkg.toml documentation. 5 | # 6 | # required = ["github.com/user/thing/cmd/thing"] 7 | # ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] 8 | # 9 | # [[constraint]] 10 | # name = "github.com/user/project" 11 | # version = "1.0.0" 12 | # 13 | # [[constraint]] 14 | # name = "github.com/user/project2" 15 | # branch = "dev" 16 | # source = "github.com/myfork/project2" 17 | # 18 | # [[override]] 19 | # name = "github.com/x/y" 20 | # version = "2.4.0" 21 | 22 | 23 | [[constraint]] 24 | name = "github.com/Sirupsen/logrus" 25 | version = "1.0.4" 26 | 27 | [[constraint]] 28 | branch = "master" 29 | name = "github.com/vishvananda/netlink" 30 | 31 | [[constraint]] 32 | name = "github.com/coreos/etcd" 33 | version = "3.3.1" 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 ~ present cssivision 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | GOOS=linux GOARCH=amd64 go build -o vxlan -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VXLAN 2 | 3 | This is a toy used to learn VXLAN. 4 | 5 | Virtual Extensible LAN (VXLAN) is a network virtualization technology that attempts to address the scalability problems associated with large cloud computing deployments. It uses a VLAN-like encapsulation technique to encapsulate OSI layer 2 Ethernet frames within layer 4 UDP datagrams. 6 | 7 | ## The objective of VXLan 8 | 9 | ### 1, Create virtual L2 network over physical L3 network 10 | ![](./vxlan_overview.png) 11 | 12 | ### 2, VXLan encpasulates L2 packet inside L3 packet 13 | ![](./encapsulates.png) 14 | 15 | ### 3, `VTEP` implementaion 16 | 17 | packet encpasulates is not enought for L2 over L3. VXLan device need to implement the following features. 18 | - ARP resolution: Need to reply to ARP request from local servers without broadcasting the ARP packet. 19 | - Destination search: Need to find the destination location corresponding to the destination MAC. 20 | 21 | those features refered as `VTEP`(VXLAN endpoints, which terminate VXLAN tunnels and may be either virtual or physical switch ports, are known as VXLAN tunnel endpoints (VTEPs)). 22 | 23 | there are some variations of `VETP` implementation, we use local agent and virtual VXLan switch run on Linux servers. 24 | 25 | ## Implementation `VTEP` in this project. 26 | 27 | Create the vxlan device, Then as each remote host is discovered (either on startup or when they are added), do the following 28 | 29 | - Create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote host). 30 | - Create a static ARP entry for the remote host IP address (and the VTEP MAC) 31 | - Create an FDB entry with the VTEP MAC and the public IP of the remote daemon. 32 | 33 | In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host. 34 | 35 | use `etcd` as the key-value store to exchange information when remote host status changed(add, delete, update, etc...). 36 | 37 | ## Usage 38 | 39 | get the lastest release [binary](https://github.com/cssivision/vxlan/releases). 40 | 41 | Run 42 | ```sh 43 | sudo ./vxlan -etcdEndpoint http://etcd:2379 44 | ``` 45 | 46 | you will get log similar to the following. 47 | ``` 48 | INFO[0000] Determining IP address of default interface 49 | INFO[0000] Using interface with name eth0 and address 10.146.0.3 50 | INFO[0000] Defaulting external address to interface address (10.146.0.3) 51 | INFO[0000] VXLAN device already exists 52 | INFO[0000] Returning existing device 53 | INFO[0000] subnet key expired in: 2018-02-25 09:23:53.467058164 +0000 UTC 54 | INFO[0000] create subnet: 10.10.238.0, net mask: 24 55 | INFO[0000] MTU: 1410 56 | INFO[0000] VXLan HardwareAddr: 1a:0f:87:98:5e:c7 57 | INFO[0000] Running backend. 58 | INFO[0000] adding subnet: 10.5.10.0/24 PublicIP: 10.140.0.3 VtepMAC: f6:ad:73:33:de:0b 59 | INFO[0000] calling AddARP: 10.5.10.0, f6:ad:73:33:de:0b 60 | INFO[0000] calling AddFDB: 10.140.0.3, f6:ad:73:33:de:0b 61 | ``` 62 | 63 | ## Use with docker 64 | Docker daemon accepts --bip argument to configure the subnet of the docker0 bridge. It also accepts --mtu to set the MTU for docker0 and veth devices that it will be creating. 65 | 66 | use `subnet` and `MTU` in log 67 | ``` 68 | INFO[0000] create subnet: 10.10.238.0, net mask: 24 69 | INFO[0000] MTU: 1410 70 | ``` 71 | instead of `10.10.238.0/24` use `10.10.238.1/24` as the ip of the docker0 bridge. 72 | ```sh 73 | dockerd --bip=${10.10.238.1/24} --mtu=1410 & 74 | ``` 75 | 76 | ## Reference 77 | - Flannel https://github.com/coreos/flannel 78 | - VXLan attributes, Please consult the man page for `ip link` and see the vxlan section for more details. 79 | - How VXLan work? https://www.slideshare.net/enakai/how-vxlan-works-on-linux 80 | - how-vxlan-works-on-l2-and-across-l3-networks https://www.slideshare.net/anandnande/how-vxlan-works-on-l2-and-across-l3-networks 81 | - IP route fundamental 82 | - https://www.thegeekstuff.com/2012/04/ip-routing-intro/ 83 | - https://www.thegeekstuff.com/2012/04/route-examples/ 84 | - https://www.thegeekstuff.com/2012/05/route-flags/ 85 | -------------------------------------------------------------------------------- /device.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "syscall" 7 | 8 | "github.com/Sirupsen/logrus" 9 | "github.com/vishvananda/netlink" 10 | ) 11 | 12 | type vxlanDeviceAttrs struct { 13 | vni uint32 14 | name string 15 | vtepIndex int 16 | vtepAddr net.IP 17 | vtepPort int 18 | gbp bool 19 | } 20 | 21 | type vxlanDevice struct { 22 | link *netlink.Vxlan 23 | directRouting bool 24 | } 25 | 26 | func newVxlanDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) { 27 | link := &netlink.Vxlan{ 28 | LinkAttrs: netlink.LinkAttrs{ 29 | Name: devAttrs.name, 30 | }, 31 | VxlanId: int(devAttrs.vni), 32 | VtepDevIndex: devAttrs.vtepIndex, 33 | SrcAddr: devAttrs.vtepAddr, 34 | Port: devAttrs.vtepPort, 35 | Learning: false, 36 | GBP: devAttrs.gbp, 37 | } 38 | 39 | link, err := ensureLink(link) 40 | if err != nil { 41 | return nil, err 42 | } 43 | return &vxlanDevice{ 44 | link: link, 45 | }, nil 46 | } 47 | 48 | func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) { 49 | err := netlink.LinkAdd(vxlan) 50 | if err == syscall.EEXIST { 51 | // it's ok if the device already exists as long as config is similar 52 | logrus.Infof("VXLAN device already exists") 53 | existing, err := netlink.LinkByName(vxlan.Name) 54 | if err != nil { 55 | return nil, err 56 | } 57 | 58 | incompat := vxlanLinksIncompat(vxlan, existing) 59 | if incompat == "" { 60 | logrus.Infof("Returning existing device") 61 | return existing.(*netlink.Vxlan), nil 62 | } 63 | 64 | // delete existing 65 | logrus.Warningf("%q already exists with incompatable configuration: %v; recreating device", vxlan.Name, incompat) 66 | if err = netlink.LinkDel(existing); err != nil { 67 | return nil, fmt.Errorf("failed to delete interface: %v", err) 68 | } 69 | 70 | // create new 71 | if err = netlink.LinkAdd(vxlan); err != nil { 72 | return nil, fmt.Errorf("failed to create vxlan interface: %v", err) 73 | } 74 | } else if err != nil { 75 | return nil, err 76 | } 77 | 78 | ifindex := vxlan.Index 79 | link, err := netlink.LinkByIndex(vxlan.Index) 80 | if err != nil { 81 | return nil, fmt.Errorf("can't locate created vxlan device with index %v", ifindex) 82 | } 83 | 84 | var ok bool 85 | if vxlan, ok = link.(*netlink.Vxlan); !ok { 86 | return nil, fmt.Errorf("created vxlan device with index %v is not vxlan", ifindex) 87 | } 88 | 89 | return vxlan, nil 90 | } 91 | 92 | func vxlanLinksIncompat(l1, l2 netlink.Link) string { 93 | if l1.Type() != l2.Type() { 94 | return fmt.Sprintf("link type: %v vs %v", l1.Type(), l2.Type()) 95 | } 96 | 97 | v1 := l1.(*netlink.Vxlan) 98 | v2 := l2.(*netlink.Vxlan) 99 | 100 | if v1.VxlanId != v2.VxlanId { 101 | return fmt.Sprintf("vni: %v vs %v", v1.VxlanId, v2.VxlanId) 102 | } 103 | 104 | if v1.VtepDevIndex > 0 && v2.VtepDevIndex > 0 && v1.VtepDevIndex != v2.VtepDevIndex { 105 | return fmt.Sprintf("vtep (external) interface: %v vs %v", v1.VtepDevIndex, v2.VtepDevIndex) 106 | } 107 | 108 | if len(v1.SrcAddr) > 0 && len(v2.SrcAddr) > 0 && !v1.SrcAddr.Equal(v2.SrcAddr) { 109 | return fmt.Sprintf("vtep (external) IP: %v vs %v", v1.SrcAddr, v2.SrcAddr) 110 | } 111 | 112 | if len(v1.Group) > 0 && len(v2.Group) > 0 && !v1.Group.Equal(v2.Group) { 113 | return fmt.Sprintf("group address: %v vs %v", v1.Group, v2.Group) 114 | } 115 | 116 | if v1.L2miss != v2.L2miss { 117 | return fmt.Sprintf("l2miss: %v vs %v", v1.L2miss, v2.L2miss) 118 | } 119 | 120 | if v1.Port > 0 && v2.Port > 0 && v1.Port != v2.Port { 121 | return fmt.Sprintf("port: %v vs %v", v1.Port, v2.Port) 122 | } 123 | 124 | if v1.GBP != v2.GBP { 125 | return fmt.Sprintf("gbp: %v vs %v", v1.GBP, v2.GBP) 126 | } 127 | 128 | return "" 129 | } 130 | 131 | func (dev *vxlanDevice) configure(ipn string) error { 132 | if err := ensureV4AddressOnLink(ipn, dev.link); err != nil { 133 | return fmt.Errorf("failed to ensure address of interface %s: %s", dev.link.Attrs().Name, err) 134 | } 135 | 136 | if err := netlink.LinkSetUp(dev.link); err != nil { 137 | return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err) 138 | } 139 | 140 | return nil 141 | } 142 | 143 | func (dev *vxlanDevice) handleSubnetEvents(batch []Event) { 144 | for _, event := range batch { 145 | sn := event.Subnet 146 | attrs := event.Attrs 147 | 148 | // This route is used when traffic should be vxlan encapsulated 149 | vxlanRoute := netlink.Route{ 150 | LinkIndex: dev.link.Attrs().Index, 151 | Scope: netlink.SCOPE_UNIVERSE, 152 | Dst: sn.ToIPNet(), 153 | Gw: sn.IP.ToIP(), 154 | } 155 | vxlanRoute.SetFlag(syscall.RTNH_F_ONLINK) 156 | 157 | if event.Type == eventAdd { 158 | logrus.Infof("adding subnet: %s PublicIP: %s VtepMAC: %s", sn.StringSep(".", "/"), attrs.PublicIP.ToIP(), net.HardwareAddr(attrs.HardwareAddr)) 159 | if err := dev.AddARP(neighbor{IP: sn.IP.ToIP(), MAC: net.HardwareAddr(attrs.HardwareAddr)}); err != nil { 160 | logrus.Error("AddARP failed: ", err) 161 | continue 162 | } 163 | 164 | if err := dev.AddFDB(neighbor{IP: attrs.PublicIP.ToIP(), MAC: net.HardwareAddr(attrs.HardwareAddr)}); err != nil { 165 | logrus.Error("AddFDB failed: ", err) 166 | 167 | // Try to clean up the ARP entry then continue 168 | if err := dev.DelARP(neighbor{IP: sn.IP.ToIP(), MAC: net.HardwareAddr(attrs.HardwareAddr)}); err != nil { 169 | logrus.Error("DelARP failed: ", err) 170 | } 171 | 172 | continue 173 | } 174 | 175 | // Set the route - the kernel would ARP for the Gw IP address if it hadn't already been set above so make sure 176 | // this is done last. 177 | if err := netlink.RouteReplace(&vxlanRoute); err != nil { 178 | logrus.Errorf("failed to add vxlanRoute (%s -> %s): %v", vxlanRoute.Dst, vxlanRoute.Gw, err) 179 | 180 | // Try to clean up both the ARP and FDB entries then continue 181 | if err := dev.DelARP(neighbor{IP: sn.IP.ToIP(), MAC: net.HardwareAddr(attrs.HardwareAddr)}); err != nil { 182 | logrus.Error("DelARP failed: ", err) 183 | } 184 | 185 | if err := dev.DelFDB(neighbor{IP: attrs.PublicIP.ToIP(), MAC: net.HardwareAddr(attrs.HardwareAddr)}); err != nil { 186 | logrus.Error("DelFDB failed: ", err) 187 | } 188 | 189 | continue 190 | } 191 | } else { 192 | logrus.Infof("invalid event type: %v\n", event.Type) 193 | } 194 | } 195 | } 196 | 197 | type neighbor struct { 198 | MAC net.HardwareAddr 199 | IP net.IP 200 | } 201 | 202 | func (dev *vxlanDevice) AddFDB(n neighbor) error { 203 | logrus.Infof("calling AddFDB: %v, %v", n.IP, n.MAC) 204 | return netlink.NeighSet(&netlink.Neigh{ 205 | LinkIndex: dev.link.Index, 206 | State: netlink.NUD_PERMANENT, 207 | Family: syscall.AF_BRIDGE, 208 | Flags: netlink.NTF_SELF, 209 | IP: n.IP, 210 | HardwareAddr: n.MAC, 211 | }) 212 | } 213 | 214 | func (dev *vxlanDevice) DelFDB(n neighbor) error { 215 | logrus.Infof("calling DelFDB: %v, %v", n.IP, n.MAC) 216 | return netlink.NeighDel(&netlink.Neigh{ 217 | LinkIndex: dev.link.Index, 218 | Family: syscall.AF_BRIDGE, 219 | Flags: netlink.NTF_SELF, 220 | IP: n.IP, 221 | HardwareAddr: n.MAC, 222 | }) 223 | } 224 | 225 | func (dev *vxlanDevice) AddARP(n neighbor) error { 226 | logrus.Infof("calling AddARP: %v, %v", n.IP, n.MAC) 227 | return netlink.NeighSet(&netlink.Neigh{ 228 | LinkIndex: dev.link.Index, 229 | State: netlink.NUD_PERMANENT, 230 | Type: syscall.RTN_UNICAST, 231 | IP: n.IP, 232 | HardwareAddr: n.MAC, 233 | }) 234 | } 235 | 236 | func (dev *vxlanDevice) DelARP(n neighbor) error { 237 | logrus.Infof("calling DelARP: %v, %v", n.IP, n.MAC) 238 | return netlink.NeighDel(&netlink.Neigh{ 239 | LinkIndex: dev.link.Index, 240 | State: netlink.NUD_PERMANENT, 241 | Type: syscall.RTN_UNICAST, 242 | IP: n.IP, 243 | HardwareAddr: n.MAC, 244 | }) 245 | } 246 | 247 | // ensureV4AddressOnLink ensures that there is only one v4 Addr on `link` and it equals `ipn`. 248 | // If there exist multiple addresses on link, it returns an error message to tell callers to remove additional address. 249 | func ensureV4AddressOnLink(ipn string, link netlink.Link) error { 250 | addr, err := netlink.ParseAddr(ipn) 251 | if err != nil { 252 | return fmt.Errorf("parse address error: %v", err) 253 | } 254 | 255 | existingAddrs, err := netlink.AddrList(link, netlink.FAMILY_V4) 256 | if err != nil { 257 | return err 258 | } 259 | 260 | // this will never happen. This situation can only be caused by a user, so get them to sort it out. 261 | if len(existingAddrs) > 1 { 262 | return fmt.Errorf("link has incompatible addresses. Remove additional addresses and try again. %#v", link) 263 | } 264 | 265 | // If the device has an incompatible address then delete it. This can happen if the lease changes for example. 266 | if len(existingAddrs) == 1 && !existingAddrs[0].Equal(*addr) { 267 | if err := netlink.AddrDel(link, &existingAddrs[0]); err != nil { 268 | return fmt.Errorf("failed to remove IP address %s from %s: %s", ipn, link.Attrs().Name, err) 269 | } 270 | existingAddrs = []netlink.Addr{} 271 | } 272 | 273 | // Actually add the desired address to the interface if needed. 274 | if len(existingAddrs) == 0 { 275 | if err := netlink.AddrAdd(link, addr); err != nil { 276 | return fmt.Errorf("failed to add IP address %s to %s: %s", ipn, link.Attrs().Name, err) 277 | } 278 | } 279 | 280 | return nil 281 | } 282 | -------------------------------------------------------------------------------- /encapsulates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cssivision/vxlan/80e9b9d4e3dca83944e7e3db54b8d103b609e7c0/encapsulates.png -------------------------------------------------------------------------------- /etcd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/coreos/etcd/client" 7 | ) 8 | 9 | func newEtcdClient(cfg config) (client.KeysAPI, error) { 10 | etcdCfg := client.Config{ 11 | Endpoints: []string{cfg.etcdEndpoint}, 12 | Transport: client.DefaultTransport, 13 | // set timeout per request to fail fast when the target endpoint is unavailable 14 | HeaderTimeoutPerRequest: time.Second, 15 | } 16 | c, err := client.New(etcdCfg) 17 | if err != nil { 18 | return nil, err 19 | } 20 | return client.NewKeysAPI(c), nil 21 | } 22 | -------------------------------------------------------------------------------- /iptables.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "time" 7 | 8 | "github.com/Sirupsen/logrus" 9 | "github.com/coreos/go-iptables/iptables" 10 | ) 11 | 12 | type IPTables interface { 13 | AppendUnique(table string, chain string, rulespec ...string) error 14 | Delete(table string, chain string, rulespec ...string) error 15 | Exists(table string, chain string, rulespec ...string) (bool, error) 16 | } 17 | 18 | type IPTablesRule struct { 19 | table string 20 | chain string 21 | rulespec []string 22 | } 23 | 24 | func forwardRules(network string) []IPTablesRule { 25 | return []IPTablesRule{ 26 | // These rules allow traffic to be forwarded if it is to or from the network range. 27 | {"filter", "FORWARD", []string{"-s", network, "-j", "ACCEPT"}}, 28 | {"filter", "FORWARD", []string{"-d", network, "-j", "ACCEPT"}}, 29 | } 30 | } 31 | 32 | func ipTablesRulesExist(ipt IPTables, rules []IPTablesRule) (bool, error) { 33 | for _, rule := range rules { 34 | exists, err := ipt.Exists(rule.table, rule.chain, rule.rulespec...) 35 | if err != nil { 36 | // this shouldn't ever happen 37 | return false, fmt.Errorf("failed to check rule existence: %v", err) 38 | } 39 | if !exists { 40 | return false, nil 41 | } 42 | } 43 | 44 | return true, nil 45 | } 46 | 47 | func setupAndEnsureIPTables(rules []IPTablesRule, resyncPeriod int) { 48 | ipt, err := iptables.New() 49 | if err != nil { 50 | // if we can't find iptables, give up and return 51 | logrus.Errorf("Failed to setup IPTables. iptables binary was not found: %v", err) 52 | return 53 | } 54 | 55 | defer func() { 56 | teardownIPTables(ipt, rules) 57 | }() 58 | 59 | for { 60 | // Ensure that all the iptables rules exist every 5 seconds 61 | if err := ensureIPTables(ipt, rules); err != nil { 62 | logrus.Errorf("Failed to ensure iptables rules: %v", err) 63 | } 64 | 65 | time.Sleep(time.Duration(resyncPeriod) * time.Second) 66 | } 67 | } 68 | 69 | func ensureIPTables(ipt IPTables, rules []IPTablesRule) error { 70 | exists, err := ipTablesRulesExist(ipt, rules) 71 | if err != nil { 72 | return fmt.Errorf("Error checking rule existence: %v", err) 73 | } 74 | if exists { 75 | // if all the rules already exist, no need to do anything 76 | return nil 77 | } 78 | // Otherwise, teardown all the rules and set them up again 79 | // We do this because the order of the rules is important 80 | logrus.Info("Some iptables rules are missing; deleting and recreating rules") 81 | teardownIPTables(ipt, rules) 82 | if err = setupIPTables(ipt, rules); err != nil { 83 | return fmt.Errorf("Error setting up rules: %v", err) 84 | } 85 | return nil 86 | } 87 | 88 | func setupIPTables(ipt IPTables, rules []IPTablesRule) error { 89 | for _, rule := range rules { 90 | logrus.Info("Adding iptables rule: ", strings.Join(rule.rulespec, " ")) 91 | err := ipt.AppendUnique(rule.table, rule.chain, rule.rulespec...) 92 | if err != nil { 93 | return fmt.Errorf("failed to insert IPTables rule: %v", err) 94 | } 95 | } 96 | 97 | return nil 98 | } 99 | 100 | func teardownIPTables(ipt IPTables, rules []IPTablesRule) { 101 | for _, rule := range rules { 102 | logrus.Info("Deleting iptables rule: ", strings.Join(rule.rulespec, " ")) 103 | // We ignore errors here because if there's an error it's almost certainly because the rule 104 | // doesn't exist, which is fine (we don't need to delete rules that don't exist) 105 | ipt.Delete(rule.table, rule.chain, rule.rulespec...) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "flag" 7 | "fmt" 8 | "math/rand" 9 | "net" 10 | "os" 11 | "os/signal" 12 | "syscall" 13 | "time" 14 | 15 | "github.com/Sirupsen/logrus" 16 | "github.com/vishvananda/netlink" 17 | ) 18 | 19 | func init() { 20 | rand.Seed(time.Now().Unix()) 21 | } 22 | 23 | const ( 24 | defaultVNI = 1 25 | iptablesResyncSeconds = 5 26 | encapOverhead = 50 27 | vxlanNetwork = "10.5.0.0/16" 28 | subNetworkTpl = "10.5.%v.0" 29 | ) 30 | 31 | type config struct { 32 | etcdEndpoint string 33 | } 34 | 35 | func main() { 36 | cfg := config{} 37 | flag.StringVar(&cfg.etcdEndpoint, "etcdEndpoint", "http://127.0.0.1:2379", "etcd endpoint") 38 | flag.Parse() 39 | 40 | sigs := make(chan os.Signal, 1) 41 | signal.Notify(sigs, os.Interrupt, syscall.SIGTERM) 42 | 43 | extIface, err := lookupExtIface() 44 | if err != nil { 45 | panic(fmt.Sprintf("lookupExtIface err: %v", err)) 46 | } 47 | 48 | devAttrs := vxlanDeviceAttrs{ 49 | vni: defaultVNI, 50 | name: fmt.Sprintf("vxlan.%v", defaultVNI), 51 | vtepIndex: extIface.Iface.Index, 52 | vtepAddr: extIface.IfaceAddr, 53 | vtepPort: 0, 54 | gbp: false, 55 | } 56 | 57 | dev, err := newVxlanDevice(&devAttrs) 58 | if err != nil { 59 | panic(fmt.Sprintf("newVXLANDevice err: %v", err)) 60 | } 61 | dev.directRouting = false 62 | 63 | publicIP := FromIP(extIface.ExtAddr) 64 | snIP := FromIP(net.ParseIP(fmt.Sprintf(subNetworkTpl, 1+rand.Intn(254)))) 65 | sn := IP4Net{ 66 | IP: snIP, 67 | PrefixLen: 24, 68 | } 69 | attrs := Attrs{ 70 | PublicIP: publicIP, 71 | Subnet: sn, 72 | HardwareAddr: dev.link.HardwareAddr, 73 | } 74 | 75 | ctx := context.Background() 76 | 77 | sm := newManager(cfg) 78 | if err := sm.createSubnet(ctx, sn, attrs); err != nil { 79 | panic(fmt.Errorf("create subnet fail: %v", err)) 80 | } 81 | 82 | logrus.Infof("create subnet: %v, net mask: %v", sn.IP.ToIP(), sn.PrefixLen) 83 | 84 | go handleSubnets(ctx, sn, &sm, dev) 85 | 86 | if err := dev.configure(fmt.Sprintf("%v/32", snIP.ToIP())); err != nil { 87 | panic(fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err)) 88 | } 89 | 90 | go setupAndEnsureIPTables(forwardRules(vxlanNetwork), iptablesResyncSeconds) 91 | logrus.Infof("MTU: %v", extIface.Iface.MTU-encapOverhead) 92 | logrus.Infof("VXLan HardwareAddr: %v", dev.link.HardwareAddr) 93 | logrus.Info("Running backend.") 94 | <-sigs 95 | logrus.Info("shutdownHandler sent cancel signal...") 96 | } 97 | 98 | type externalInterface struct { 99 | Iface *net.Interface 100 | IfaceAddr net.IP 101 | ExtAddr net.IP 102 | } 103 | 104 | func lookupExtIface() (*externalInterface, error) { 105 | var iface *net.Interface 106 | var ifaceAddr net.IP 107 | var err error 108 | 109 | logrus.Info("Determining IP address of default interface") 110 | if iface, err = getDefaultGatewayIface(); err != nil { 111 | return nil, fmt.Errorf("failed to get default interface: %s", err) 112 | } 113 | 114 | if ifaceAddr == nil { 115 | ifaceAddr, err = getIfaceIP4Addr(iface) 116 | if err != nil { 117 | return nil, fmt.Errorf("failed to find IPv4 address for interface %s", iface.Name) 118 | } 119 | } 120 | 121 | logrus.Infof("Using interface with name %s and address %s", iface.Name, ifaceAddr) 122 | 123 | if iface.MTU == 0 { 124 | return nil, fmt.Errorf("failed to determine MTU for %s interface", ifaceAddr) 125 | } 126 | 127 | var extAddr net.IP 128 | if extAddr == nil { 129 | logrus.Infof("Defaulting external address to interface address (%s)", ifaceAddr) 130 | extAddr = ifaceAddr 131 | } 132 | 133 | return &externalInterface{ 134 | Iface: iface, 135 | IfaceAddr: ifaceAddr, 136 | ExtAddr: extAddr, 137 | }, nil 138 | } 139 | 140 | func getDefaultGatewayIface() (*net.Interface, error) { 141 | routes, err := netlink.RouteList(nil, syscall.AF_INET) 142 | if err != nil { 143 | return nil, err 144 | } 145 | 146 | for _, route := range routes { 147 | if route.Dst == nil || route.Dst.String() == "0.0.0.0/0" { 148 | if route.LinkIndex <= 0 { 149 | return nil, errors.New("Found default route but could not determine interface") 150 | } 151 | return net.InterfaceByIndex(route.LinkIndex) 152 | } 153 | } 154 | 155 | return nil, errors.New("Unable to find default route") 156 | } 157 | 158 | func getIfaceAddrs(iface *net.Interface) ([]netlink.Addr, error) { 159 | link := &netlink.Device{ 160 | netlink.LinkAttrs{ 161 | Index: iface.Index, 162 | }, 163 | } 164 | 165 | return netlink.AddrList(link, syscall.AF_INET) 166 | } 167 | 168 | func getIfaceIP4Addr(iface *net.Interface) (net.IP, error) { 169 | addrs, err := getIfaceAddrs(iface) 170 | if err != nil { 171 | return nil, err 172 | } 173 | 174 | // prefer non link-local addr 175 | var ll net.IP 176 | 177 | for _, addr := range addrs { 178 | if addr.IP.To4() == nil { 179 | continue 180 | } 181 | 182 | if addr.IP.IsGlobalUnicast() { 183 | return addr.IP, nil 184 | } 185 | 186 | if addr.IP.IsLinkLocalUnicast() { 187 | ll = addr.IP 188 | } 189 | } 190 | 191 | if ll != nil { 192 | // didn't find global but found link-local. it'll do. 193 | return ll, nil 194 | } 195 | 196 | return nil, errors.New("No IPv4 address found for given interface") 197 | } 198 | -------------------------------------------------------------------------------- /subnet.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "net" 8 | "path" 9 | "regexp" 10 | "strconv" 11 | "time" 12 | 13 | "github.com/Sirupsen/logrus" 14 | "github.com/coreos/etcd/client" 15 | ) 16 | 17 | var ( 18 | subnetRegex = regexp.MustCompile(`(\d+\.\d+.\d+.\d+)-(\d+)`) 19 | eventAdd = "add" 20 | ) 21 | 22 | type IP4 uint 23 | 24 | func (ip IP4) Octets() (a, b, c, d byte) { 25 | a, b, c, d = byte(ip>>24), byte(ip>>16), byte(ip>>8), byte(ip) 26 | return 27 | } 28 | 29 | func (ip IP4) ToIP() net.IP { 30 | return net.IPv4(ip.Octets()) 31 | } 32 | 33 | func (ip IP4) StringSep(sep string) string { 34 | a, b, c, d := ip.Octets() 35 | return fmt.Sprintf("%d%s%d%s%d%s%d", a, sep, b, sep, c, sep, d) 36 | } 37 | 38 | func FromBytes(ip []byte) IP4 { 39 | return IP4(uint32(ip[3]) | 40 | (uint32(ip[2]) << 8) | 41 | (uint32(ip[1]) << 16) | 42 | (uint32(ip[0]) << 24)) 43 | } 44 | 45 | func FromIP(ip net.IP) IP4 { 46 | return FromBytes(ip.To4()) 47 | } 48 | 49 | // similar to net.IPNet but has uint based representation 50 | type IP4Net struct { 51 | IP IP4 52 | PrefixLen uint 53 | } 54 | 55 | func (n IP4Net) ToIPNet() *net.IPNet { 56 | return &net.IPNet{ 57 | IP: n.IP.ToIP(), 58 | Mask: net.CIDRMask(int(n.PrefixLen), 32), 59 | } 60 | } 61 | 62 | func (n IP4Net) StringSep(octetSep, prefixSep string) string { 63 | return fmt.Sprintf("%s%s%d", n.IP.StringSep(octetSep), prefixSep, n.PrefixLen) 64 | } 65 | 66 | func MakeSubnetKey(sn IP4Net) string { 67 | return sn.StringSep(".", "-") 68 | } 69 | 70 | type Attrs struct { 71 | PublicIP IP4 72 | Subnet IP4Net 73 | HardwareAddr net.HardwareAddr 74 | } 75 | 76 | type manager struct { 77 | cli client.KeysAPI 78 | Prefix string 79 | } 80 | 81 | func newManager(cfg config) manager { 82 | etcdCli, err := newEtcdClient(cfg) 83 | if err != nil { 84 | panic(fmt.Sprintf("new etcd client err: %v", err)) 85 | } 86 | 87 | return manager{ 88 | cli: etcdCli, 89 | Prefix: "/vxlan", 90 | } 91 | } 92 | 93 | type Event struct { 94 | Type string 95 | Subnet IP4Net 96 | Attrs Attrs 97 | } 98 | 99 | type subnetWatcher struct { 100 | Subnet *IP4Net 101 | } 102 | 103 | func (sw *subnetWatcher) update(evts []Event) []Event { 104 | batch := []Event{} 105 | 106 | for _, e := range evts { 107 | if sw.Subnet != nil && e.Subnet.IP == sw.Subnet.IP && e.Subnet.PrefixLen == sw.Subnet.PrefixLen { 108 | continue 109 | } 110 | 111 | batch = append(batch, e) 112 | } 113 | 114 | return batch 115 | } 116 | 117 | func watchSubnets(ctx context.Context, sm *manager, ownSn *IP4Net, receiver chan []Event) { 118 | var index *uint64 119 | 120 | sw := subnetWatcher{ 121 | Subnet: ownSn, 122 | } 123 | 124 | for { 125 | var evts []Event 126 | var err error 127 | evts, index, err = sm.watchEvents(ctx, index) 128 | if err != nil { 129 | logrus.Errorf("Watch subnets: %v", err) 130 | time.Sleep(time.Second) 131 | continue 132 | } 133 | 134 | var batch []Event 135 | if len(evts) > 0 { 136 | batch = sw.update(evts) 137 | } 138 | 139 | if len(batch) > 0 { 140 | receiver <- batch 141 | } 142 | } 143 | } 144 | 145 | func (m *manager) watchEvents(ctx context.Context, index *uint64) ([]Event, *uint64, error) { 146 | if index == nil { 147 | return m.getSubnets(ctx) 148 | } 149 | 150 | evt, idx, err := m.watchSubnets(ctx, index) 151 | if err != nil { 152 | return nil, nil, err 153 | } 154 | 155 | return []Event{evt}, &idx, nil 156 | } 157 | 158 | func (m *manager) watchSubnets(ctx context.Context, since *uint64) (Event, uint64, error) { 159 | key := path.Join(m.Prefix, "subnets") 160 | opts := &client.WatcherOptions{ 161 | AfterIndex: *since, 162 | Recursive: true, 163 | } 164 | 165 | e, err := m.cli.Watcher(key, opts).Next(ctx) 166 | if err != nil { 167 | return Event{}, 0, err 168 | } 169 | 170 | evt, err := parseSubnetWatchResponse(e) 171 | return evt, e.Node.ModifiedIndex, err 172 | } 173 | 174 | func (m *manager) getSubnets(ctx context.Context) ([]Event, *uint64, error) { 175 | key := path.Join(m.Prefix, "subnets") 176 | resp, err := m.cli.Get(ctx, key, &client.GetOptions{Recursive: true, Quorum: true}) 177 | if err != nil { 178 | if etcdErr, ok := err.(client.Error); ok && etcdErr.Code == client.ErrorCodeKeyNotFound { 179 | return []Event{}, nil, nil 180 | } 181 | return nil, nil, err 182 | } 183 | 184 | evts := []Event{} 185 | 186 | for _, node := range resp.Node.Nodes { 187 | l, err := nodeToEvent(node) 188 | if err != nil { 189 | logrus.Warningf("Ignoring bad subnet node: %v", err) 190 | continue 191 | } 192 | 193 | evts = append(evts, *l) 194 | } 195 | 196 | return evts, &resp.Index, nil 197 | } 198 | 199 | func ParseSubnetKey(s string) *IP4Net { 200 | if parts := subnetRegex.FindStringSubmatch(s); len(parts) == 3 { 201 | snIp := net.ParseIP(parts[1]).To4() 202 | prefixLen, err := strconv.ParseUint(parts[2], 10, 5) 203 | if snIp != nil && err == nil { 204 | return &IP4Net{IP: FromIP(snIp), PrefixLen: uint(prefixLen)} 205 | } 206 | } 207 | 208 | return nil 209 | } 210 | 211 | func nodeToEvent(node *client.Node) (*Event, error) { 212 | sn := ParseSubnetKey(node.Key) 213 | if sn == nil { 214 | return nil, fmt.Errorf("failed to parse subnet key %s", node.Key) 215 | } 216 | 217 | attrs := &Attrs{} 218 | if err := json.Unmarshal([]byte(node.Value), attrs); err != nil { 219 | return nil, err 220 | } 221 | 222 | evt := Event{ 223 | Type: eventAdd, 224 | Attrs: *attrs, 225 | Subnet: attrs.Subnet, 226 | } 227 | 228 | return &evt, nil 229 | } 230 | 231 | func parseSubnetWatchResponse(resp *client.Response) (Event, error) { 232 | sn := ParseSubnetKey(resp.Node.Key) 233 | if sn == nil { 234 | return Event{}, fmt.Errorf("%v %q: not a subnet, skipping", resp.Action, resp.Node.Key) 235 | } 236 | 237 | switch resp.Action { 238 | case "delete", "expire": 239 | return Event{}, fmt.Errorf("%v %q: not support, skipping", resp.Action, resp.Node.Key) 240 | 241 | default: 242 | attrs := &Attrs{} 243 | err := json.Unmarshal([]byte(resp.Node.Value), attrs) 244 | if err != nil { 245 | return Event{}, err 246 | } 247 | 248 | evt := Event{ 249 | Type: eventAdd, 250 | Subnet: *sn, 251 | Attrs: *attrs, 252 | } 253 | return evt, nil 254 | } 255 | } 256 | 257 | func (m *manager) createSubnet(ctx context.Context, sn IP4Net, attrs Attrs) error { 258 | key := path.Join(m.Prefix, "subnets", MakeSubnetKey(sn)) 259 | value, err := json.Marshal(attrs) 260 | if err != nil { 261 | return err 262 | } 263 | 264 | opts := &client.SetOptions{ 265 | PrevExist: client.PrevNoExist, 266 | TTL: time.Hour * 24, 267 | } 268 | 269 | resp, err := m.cli.Set(ctx, key, string(value), opts) 270 | if err != nil { 271 | return err 272 | } 273 | 274 | if resp.Node.Expiration != nil { 275 | logrus.Infof("subnet key expired in: %v", resp.Node.Expiration) 276 | } 277 | return nil 278 | } 279 | 280 | func handleSubnets(ctx context.Context, sn IP4Net, sm *manager, dev *vxlanDevice) { 281 | evts := make(chan []Event) 282 | go func() { 283 | watchSubnets(ctx, sm, &sn, evts) 284 | logrus.Info("watch subnets exit") 285 | }() 286 | 287 | for evtBatch := range evts { 288 | dev.handleSubnetEvents(evtBatch) 289 | } 290 | } 291 | -------------------------------------------------------------------------------- /vxlan.sh: -------------------------------------------------------------------------------- 1 | # The remote IP should be 100.1.1.1/100.1.1.2. This should work: 2 | ip address add 100.1.1.1/24 dev enp0s8 3 | ifconfig enp0s8 up 4 | ip link add name vxlan42 type vxlan id 42 dev enp0s8 remote 100.1.1.2 local 100.1.1.1 dstport 4789 5 | ip address add 50.1.1.1/24 dev vxlan42 6 | ip link set up vxlan42 7 | 8 | ip address add 100.1.1.2/24 dev enp0s8 9 | ifconfig enp0s8 up 10 | ip link add name vxlan42 type vxlan id 42 dev enp0s8 remote 100.1.1.1 local 100.1.1.2 dstport 4789 11 | ip address add 50.1.1.2/24 dev vxlan42 12 | ip link set up vxlan42 -------------------------------------------------------------------------------- /vxlan_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cssivision/vxlan/80e9b9d4e3dca83944e7e3db54b8d103b609e7c0/vxlan_overview.png --------------------------------------------------------------------------------