├── .github └── FUNDING.yml ├── .gitmodules ├── Makefile ├── _redirects ├── archetypes └── default.md ├── config.toml ├── content ├── IPv6 │ └── _index.md ├── _index.md ├── arch │ └── _index.md ├── cni │ ├── _index.md │ ├── calico.md │ ├── cilium.md │ ├── flannel.md │ ├── iaas │ │ └── _index.md │ ├── kindnet.md │ └── weave.md ├── credits.md ├── dns │ └── _index.md ├── ingress │ ├── _index.md │ ├── egress │ │ └── _index.md │ ├── gateway │ │ └── _index.md │ └── ingress │ │ └── _index.md ├── lab │ └── _index.md ├── security │ └── _index.md └── services │ ├── Headless │ └── _index.md │ ├── Optimisations │ └── _index.md │ ├── _index.md │ ├── clusterIP │ ├── _index.md │ ├── control-plane.md │ └── dataplane │ │ ├── IPVS.md │ │ ├── eBPF.md │ │ └── iptables.md │ ├── loadBalancer │ └── _index.md │ ├── mesh │ └── _index.md │ └── nodeport │ └── _index.md ├── layouts ├── partials │ ├── favicon.html │ ├── logo.html │ └── menu-footer.html └── shortcodes │ ├── div.html │ └── iframe.html ├── license.md ├── netlify.toml └── static └── images ├── favicon.png └── k8s-guide-logo.png /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [networkop] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "themes/hugo-theme-learn"] 2 | path = themes/hugo-theme-learn 3 | url = https://github.com/matcornic/hugo-theme-learn.git 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # That's because i have two hugo versions 2 | HUGO := hugo-0.74.3 3 | 4 | RANDOM_STR = 5 | 6 | DEFAULT: local 7 | 8 | ## Start a local server 9 | local: 10 | ${HUGO} server -D --bind 0.0.0.0 11 | 12 | ## Push the latest commit upstream 13 | release: 14 | git add . 
15 | git commit -m "$$(date)" 16 | git push 17 | 18 | ## Create a new chapter 19 | chapter: 20 | hugo new --kind chapter newchapter/_index.md 21 | 22 | # From: https://gist.github.com/klmr/575726c7e05d8780505a 23 | help: 24 | @echo "$$(tput sgr0)";sed -ne"/^## /{h;s/.*//;:d" -e"H;n;s/^## //;td" -e"s/:.*//;G;s/\\n## /---/;s/\\n/ /g;p;}" ${MAKEFILE_LIST}|awk -F --- -v n=$$(tput cols) -v i=15 -v a="$$(tput setaf 6)" -v z="$$(tput sgr0)" '{printf"%s%*s%s ",a,-i,$$1,z;m=split($$2,w," ");l=n-i;for(j=1;j<=m;j++){l-=length(w[j])+1;if(l<= 0){l=n-i-length(w[j])-1;printf"\n%*s ",-i," ";}printf"%s ",w[j];}printf"\n";}' 25 | 26 | 27 | # https://desk.draw.io/support/solutions/articles/16000042542-embed-html 28 | 29 | 30 | -------------------------------------------------------------------------------- /_redirects : -------------------------------------------------------------------------------- 1 | https://k8s.networkop.co.uk/arch/* https://www.tkng.io/:splat 301! 2 | -------------------------------------------------------------------------------- /archetypes/default.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ replace .Name "-" " " | title }}" 3 | date: {{ .Date }} 4 | draft: true 5 | --- 6 | 7 | -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "http://www.tkng.io/" 2 | languageCode = "en-us" 3 | title = "The Kubernetes Networking Guide" 4 | theme = "hugo-theme-learn" 5 | 6 | [outputs] 7 | home = [ "HTML", "RSS", "JSON"] 8 | 9 | 10 | [params] 11 | description = "The Kubernetes Networking Guide" 12 | showVisitedLinks = false 13 | images = ["images/k8s-guide-logo.png"] 14 | disableLanguageSwitchingButton = true 15 | disableMermaid = true 16 | editURL = "https://github.com/networkop/k8s-networking-guide/tree/master/content/" 17 | disableInlineCopyToClipBoard = true 18 | 19 | 20 | [[menu.shortcuts]] 21 | name = " Github repo" 22 | identifier = "ds" 23 | url = "https://github.com/networkop/k8s-networking-guide" 24 | weight = 10 25 | 26 | [[menu.shortcuts]] 27 | name = " Lab repo" 28 | identifier = "labs" 29 | url = "https://github.com/networkop/k8s-guide-labs" 30 | weight = 20 31 | 32 | [[menu.shortcuts]] 33 | name = " Credits" 34 | url = "/credits" 35 | weight = 30 36 | -------------------------------------------------------------------------------- /content/IPv6/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "IPv6" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 90 5 | summary: "The state and readiness of IPv6 networking" 6 | --- 7 | 8 | 9 | # Under construction [help needed] -------------------------------------------------------------------------------- /content/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Kubernetes Networking Guide" 3 | --- 4 | 5 | # The Kubernetes Networking Guide 6 | 7 | The purpose of this website is to provide an overview of various **Kubernetes networking components** with a specific focus on **exactly how** they implement the required functionality. 8 | 9 | The information here can be used for educational purposes, however, the main goal is to provide a single point of reference for designing, operating and troubleshooting cluster networking solutions. 10 | 11 | {{% notice warning %}} 12 | This is not a generic Kubernetes learning resource. 
The assumption is that the reader is already familiar with basic concepts and building blocks of a Kubernetes cluster -- pods, deployments, services. 13 | {{% /notice %}} 14 | 15 | 16 | 17 | ## Structure 18 | 19 | The guide is split into multiple parts which can be studied mostly independently, however they all work together to provide a complete end-to-end cluster network abstractions. 20 | 21 | {{% children description="true" %}} 22 | {{% /children %}} 23 | 24 | {{% notice info %}} 25 | **Why this structure?** -- To explain Kubernetes from a network-centric view in a language understandable to people with a traditional network engineering background. This structure is also based on how [#sig-network](https://github.com/kubernetes/community/tree/master/sig-network) is organised into interest groups. 26 | {{% /notice %}} 27 | 28 | 29 | ## Hands-on Labs {#labs} 30 | 31 | Where possible, every topic in this guide will include a dedicated hands-on lab which can be spun up locally in a matter of minutes. Refer to the [Lab](lab/) page for setup instructions. 32 | 33 | 34 | 35 | ## Contributing 36 | If you found an error or want to add something to this guide, just click the **Edit this page** link displayed on top right of each page (except this one), and submit a pull request. 37 | 38 | {{% notice note %}} 39 | When submitting brand new content, please consider adding a corresponding lab to the [Labs repo](https://github.com/networkop/k8s-guide-labs) 40 | {{% /notice %}} 41 | 42 | 43 | -------------------------------------------------------------------------------- /content/arch/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: The Kubernetes Network Model 3 | menuTitle: Network Model 4 | weight: 5 5 | summary: "High-level overview of cluster networking components" 6 | --- 7 | 8 | The [official documentation](https://kubernetes.io/docs/concepts/cluster-administration/networking/#the-kubernetes-network-model) does a very good job of describing the cluster network assumptions and requirements. I'll repeat the main ideas here for completeness and to lay the foundation for the rest of the article. Kubernetes networking can be seen as several (more or less) orthogonal problems: 9 | 10 | * **Local** communications between containers in the same Pod -- solved by the local loopback interface. 11 | * **Pod-to-Pod** East-West communication -- solved by a CNI plugin and discussed in the [CNI](/cni/) chapter of this guide. 12 | * Multi-pod **service** abstraction -- a way to group similar Pods and load-balance traffic to them, discussed in the [Services](/services/) 13 | chapter of this guide. 14 | * **Ingress** & Egress communication -- getting the traffic in and out of the Kubernetes cluster, discussed in the [Ingress & Egress](/ingress/) chapter of this guide. 15 | 16 | In addition to the above, there are a number of auxiliary problems that are covered in their separate chapters: 17 | 18 | * **Network Policies** -- a way to filter traffic going to and from Pods. 19 | * **DNS** -- the foundation of cluster service discovery. 20 | * **IPv6** -- unfortunately still requires a separate chapter to discuss the multitude of caveats and limitations. 21 | 22 | Despite their orthogonality, each layer builds on top of abstractions provided by another, for example: 23 | 24 | * **Ingress** -- associates a URL with a backend Service, learns the associated Endpoints and sends the traffic to one of the PodIPs, relying on the Pod-to-Pod connectivity. 
25 | * **Service** -- performs the client-side load-balancing on the originating Node and sends the traffic to the destination PodIP, effectively relying on the Node-to-Pod connectivity. 26 | 27 | Here's an example of how different Kubernetes Resources are stacked together to provide a **North-South** connectivity: 28 | 29 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=ydZ7vDq7JmuY7Tl_GMgH&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 30 | 31 | While the above is the canonical way of exposing an application in Kubernetes, it is by no way the only one. Like in any typical cloud infrastructure, the functions of different layers overlap and, thus, create a space for additional deployment scenarios: 32 | 33 | * **Ingress** can **proxy TCP and UDP** traffic to the backend ports. This works handy when the application protocol is not HTTP or if you want to string multiple Ingress proxies together. While Ingress controllers support this through custom [ConfigMaps](https://kubernetes.github.io/ingress-nginx/user-guide/exposing-tcp-udp-services/) or [annotations](https://docs.citrix.com/en-us/citrix-k8s-ingress-controller/how-to/tcp-udp-ingress.html), the gateway API project (which can be viewed as an evolution of Ingress) supports these features [natively](https://gateway-api.sigs.k8s.io/guides/tcp/). 34 | * **Service** of type **LoadBalancer or NodePort** can be used to expose backend ports without an Ingress. This can be useful when the pods need to expose an esoteric protocol (e.g. NETCONF) or when application proxy functions are simply not needed, e.g. small-scale, internal cluster, with no need for TLS termination or traffic rate-limiting. 35 | 36 | 37 | {{% notice note %}} 38 | The main point is that Kubernetes Networking is not just a CNI or a kube-proxy or an Ingress controller. It's all of the above working in unison to provide a consistent network abstraction for hosted applications and external users. 39 | {{% /notice %}} -------------------------------------------------------------------------------- /content/cni/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: CNI 3 | menuTitle: "CNI" 4 | weight: 10 5 | summary: Pod Networking within and between Nodes 6 | --- 7 | 8 | ## Main Goals 9 | 10 | The official documentation [outlines](https://kubernetes.io/docs/concepts/cluster-administration/networking/#the-kubernetes-network-model) a number of requirements that any CNI plugin implementation should support. Rephrasing it in a slightly different way, a CNI plugin must provide at least the following two things: 11 | 12 | * **Connectivity** - making sure that a Pod gets its default `eth0` interface with IP reachable from the root network namespace of the hosting Node. 13 | * **Reachability** - making sure that Pods from other Nodes can reach each other directly (without NAT). 14 | 15 | Connectivity requirement is the most straight-forward one to understand -- every Pod must have a NIC to communicate with anything outside of its own network namespace. Some local processes on the Node (e.g. kubelet) need to reach PodIP from the root network namespace (e.g. to perform health and readiness checks), hence the root NS connectivity requirement. 
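To make the **Connectivity** requirement a bit more concrete, here is a rough manual approximation of what a simple veth-based plugin does for a single Pod. This is purely illustrative -- the namespace name, interface names and IPs below are made up, and real plugins also take care of IPAM, node-side routes and cleanup:

```bash
# Illustration only: a hand-rolled equivalent of a veth-based CNI plugin (e.g. ptp)
ip netns add demo-pod                                          # stand-in for a Pod's network namespace
ip link add veth-host type veth peer name eth0 netns demo-pod  # veth pair, one end inside the "Pod"
ip addr add 10.244.1.1/24 dev veth-host
ip link set veth-host up
ip -n demo-pod addr add 10.244.1.10/24 dev eth0                # the "PodIP"
ip -n demo-pod link set eth0 up
ip -n demo-pod link set lo up
ip -n demo-pod route add default via 10.244.1.1                # default route via the root NS end of the link
ping -c 1 10.244.1.10                                          # the PodIP is now reachable from the root NS
```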
16 | 17 | There are a number of [reference](https://github.com/containernetworking/plugins#main-interface-creating) CNI plugins that can be used to setup connectivity, most notable examples are: 18 | 19 | * **ptp** -- creates a veth link in the root namespace and plugs the other end into the Pod's namespace. 20 | * **bridge** -- does the same but also connects the rootNS end of the link to the bridge. 21 | * **macvlan/ipvlan** -- use the corresponding drivers to connect containers directly to the NIC of the Node. 22 | 23 | {{% notice info %}} 24 | These reference plugins are very often combined and re-used by other, more complicated CNI plugins (see [kindnet](/cni/kindnet/) or [flannel](/cni/flannel)). 25 | {{% /notice %}} 26 | 27 | Reachability, on the other hand, may require a bit of unpacking: 28 | 29 | * Every Pod gets a unique IP from a `PodCIDR` range configured on the Node. 30 | * This range is assigned to the Node during kubelet bootstrapping phase. 31 | * Nodes are not aware of `PodCIDRs` assigned to other Nodes, allocations are normally managed by the controller-manager based on the `--cluster-cidr` configuration flag. 32 | * Depending on the type of underlying connectivity, establishing end-to-end reachability between `PodCIDRs` may require different methods: 33 | - If all Nodes are in the **same Layer2 domain**, the connectivity can be established by configuring a **full mesh of static routes** on all Nodes with NextHop set to the internal IP of the peer Nodes. 34 | - If some Nodes are in **different Layer2 domains**, the connectivity can be established with either: 35 | * **Orchestrating the underlay** -- usually done with BGP for on-prem or some form of dynamically-provisioned static routes for public cloud environments. 36 | * **Encapsulating in the overlay** -- VXLAN is still the most popular encap type. 37 | 38 | {{% notice info %}} 39 | The above mechanisms are not determined exclusively by the underlying network. Plugins can use a mixture of different methods (e.g. host-based static routes for the same L2 segment and overlays for anything else) and the choice can be made purely based on operational complexity (e.g. overlays over BGP). 40 | {{% /notice %}} 41 | 42 | {{% notice note %}} 43 | It goes without saying that the base underlying assumption is that Nodes can reach each other using their Internal IPs. It is the responsibility of the infrastructure provider (IaaS) to fulfil this requirement. 44 | {{% /notice %}} 45 | 46 | ## Secondary Goals 47 | 48 | In addition to the base functionality described above, there's always a need to do things like: 49 | 50 | * **IP address management** to keep track of IPs allocated to each individual Pod. 51 | * **Port mappings** to expose Pods to the outside world. 52 | * **Bandwidth control** to control egress/ingress traffic rates. 53 | * **Source NAT** for traffic leaving the cluster (e.g. Internet) 54 | 55 | These functions can be performed by the same monolithic plugin or via a **plugin chaining**, where multiple plugins are specified in the configuration file and get invoked sequentially by the container runtime. 56 | 57 | 58 | {{% notice info %}} 59 | [CNI plugins repository](https://github.com/containernetworking/plugins) provides reference implementations of the most commonly used plugins. 
60 | {{% /notice %}} 61 | 62 | ## Operation 63 | 64 | Contrary to the typical network plugin design approach that includes a long-lived stateful daemon, [CNI Specification](https://github.com/containernetworking/cni/blob/master/SPEC.md) defines an interface -- a set of input/output parameters that a CNI binary is expected to ingest/produce. This makes for a very clean design that is also very easy to swap and upgrade. The most beautiful thing is that the plugin becomes completely stateless -- it's just a binary file on a disk that gets invoked whenever a Pod gets created or deleted. Here's a sequence of steps that a container runtime has to do whenever a new Pod gets created: 65 | 66 | 1. It creates a new network namespace. 67 | 2. It reads and parses the CNI configuration file -- the (numerically) first file from `/etc/cni/net.d` 68 | 3. For every plugin specified in the configuration file, it invokes the corresponding binary, passing it the following information: 69 | * Environment variables `CNI_COMMAND`, `CNI_CONTAINERID`, `CNI_NETNS`, `CNI_IFNAME`, `CNI_PATH` and `CNI_ARGS`. 70 | * A minified version of the CNI configuration file (excluding any other plugins). 71 | 72 | The last step, if done manually, would look something like this: 73 | 74 | ```bash 75 | CNI_COMMAND=ADD \ 76 | CNI_CONTAINERID=cid \ 77 | CNI_NETNS=/var/run/netns/id \ 78 | CNI_IFNAME=eth0 \ 79 | CNI_PATH=/opt/bin/bin \ 80 | CNI_ARGS=K8S_POD_NAMESPACE=foo;K8S_POD_NAME=bar; \ 81 | cni_plugin < /etc/cni/net.d/01-cni.conf 82 | ``` 83 | 84 | The CNI plugin then does all of the required interface plumbing and IP allocation and returns back (prints to stdout) the resulting [data structure](https://github.com/containernetworking/cni/blob/master/SPEC.md#result). In the case of plugin chaining, all this information (original inputs + result) gets passed to all plugins along the chain. 85 | 86 | Despite its design simplicity, unless you have something else that takes care of establishing end-to-end reachability (e.g. cloud controller), a CNI binary must be accompanied by a long-running stateful daemon/agent. This daemon usually runs in the root network namespace and manages the Node's network stack between CNI binary invocations -- at the very least it adds and removes static routes as Nodes are added to or removed from the cluster. Its operation is not dictated by any standard and the only requirement is to establish Pod-to-Pod reachability. 87 | 88 | {{% notice note %}} 89 | In reality, this daemon does a lot more than just manage reachability and may include a kube-proxy replacement, Kubernetes controller, IPAM etc. 90 | {{% /notice %}} 91 | 92 | 93 | {{% notice tip %}} 94 | See [meshnet-cni](https://github.com/networkop/meshnet-cni#architecture) for an example of binary+daemon architecture. 95 | {{% /notice %}} 96 | 97 | 98 | 99 | ## What to know more? 100 | 101 | To learn more about CNI, you can search for the "Kubernetes and the CNI: Where We Are and What's Next", which I cannot recommend highly enough. It is what's shaped my current view of the CNI and heavily inspired the current article. 
Some other links I can recommend: 102 | 103 | * [Slides: Kubernetes and the CNI: Where We Are and What's Next](https://www.caseyc.net/cni-talk-kubecon-18.pdf) 104 | * [CNI Specificaion](https://github.com/containernetworking/cni/blob/master/SPEC.md) 105 | * [CNI plugin implemented in bash](https://www.altoros.com/blog/kubernetes-networking-writing-your-own-simple-cni-plug-in-with-bash/) 106 | * [EVPN CNI plugin](http://logingood.github.io/kubernetes/cni/2016/05/14/netns-and-cni.html) 107 | * [Writing your first CNI plugin](http://dougbtv.com/nfvpe/2017/06/22/cni-tutorial/) 108 | * [Building a meshnet-cni](https://networkop.co.uk/post/2018-11-k8s-topo-p1/) 109 | * [CNI plugin chaining](https://karampok.me/posts/chained-plugins-cni/) 110 | -------------------------------------------------------------------------------- /content/cni/calico.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "calico" 3 | menuTitle: "calico" 4 | date: 2020-11-16T12:33:04+01:00 5 | weight: 15 6 | --- 7 | 8 | [Calico](https://docs.projectcalico.org/about/about-calico) is another example of a full-blown Kubernetes "networking solution" with functionality including network policy controller, kube-proxy replacement and network traffic observability. CNI functionality is still the core element of Calico and the focus of this chapter will be on how it satisfies the Kubernetes network model [requirements](/cni/#main-goals). 9 | 10 | 11 | * **Connectivity** is set up by creating a `veth` link and moving one side of that link into a Pod's namespace. The other side of the link is left dangling in the node's root namespace. For each local Pod, Calico sets up a PodIP host-route pointing over the veth link. 12 | 13 | {{% notice note %}} 14 | One oddity of Calico CNI is that the node end of the veth link does not have an IP address. In order to provide Pod-to-Node egress connectivity, each `veth` link is set up with `proxy_arp` which makes root NS respond to any ARP request coming from the Pod (assuming that the node has a default route itself). 15 | {{% /notice %}} 16 | 17 | * **Reachability** can be established in two different ways: 18 | 19 | 1. Static routes and overlays -- Calico supports IPIP and VXLAN and has an option to only setup tunnels for traffic crossing the L3 subnet boundary. 20 | 21 | 2. BGP -- the most popular choice for on-prem deployments, it works by configuring a [Bird](https://bird.network.cz/) BGP speaker on every node and setting up peerings to ensure that reachability information gets propagated to every node. There are several [options](https://docs.projectcalico.org/networking/bgp) for how to set up this peering, including full-mesh between nodes, dedicated route-reflector node and external peering with the physical network. 22 | 23 | {{% notice info %}} 24 | The above two modes are not mutually exclusive, BGP can be used with IPIP in public cloud environments. For a complete list of networking options for both on-prem and public cloud environments, refer to [this guide](https://docs.projectcalico.org/networking/determine-best-networking). 25 | {{% /notice %}} 26 | 27 | For demonstration purposes, we'll use a BGP-based configuration option with external off-cluster route-reflector. 
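As a very rough sketch, disabling the default node-to-node mesh and pointing every node at an external route reflector could look something like the following (assuming `calicoctl` is available; the AS number and peer IP are placeholders -- in the lab these steps are automated by the `make` targets shown below):

```bash
calicoctl apply -f - <<EOF
apiVersion: projectcalico.org/v3
kind: BGPConfiguration
metadata:
  name: default
spec:
  nodeToNodeMeshEnabled: false   # rely on the route reflector instead of a full mesh
  asNumber: 64512                # placeholder AS number
---
apiVersion: projectcalico.org/v3
kind: BGPPeer
metadata:
  name: external-rr
spec:
  peerIP: 172.18.0.100           # placeholder IP of the off-cluster route reflector
  asNumber: 64512
EOF
```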
The fully converged and populated IP and MAC tables will look like this: 28 | 29 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=5Q_VDU4fQs1RRTjQc7gX&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 30 | 31 | 32 | ### Lab 33 | 34 | Assuming that the lab environment is already [set up](/lab/), calico can be enabled with the following commands: 35 | 36 | ```bash 37 | make calico 38 | ``` 39 | 40 | Check that the calico-node daemonset has all pods in `READY` state: 41 | 42 | ```bash 43 | $ kubectl -n calico-system get daemonset 44 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE 45 | calico-node 3 3 3 3 3 kubernetes.io/os=linux 61s 46 | ``` 47 | 48 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin: 49 | 50 | ```bash 51 | make nuke-all-pods 52 | ``` 53 | 54 | To make sure kube-proxy and calico set up the right set of NAT rules, existing NAT tables need to be flushed and re-populated: 55 | 56 | ``` 57 | make flush-nat && make calico-restart 58 | ``` 59 | 60 | Build and start a GoBGP-based route reflector: 61 | 62 | ``` 63 | make gobgp-build && make gobgp-rr 64 | ``` 65 | 66 | Finally, reconfigure Calico's BGP daemonset to peer with the GoBGP route reflector: 67 | 68 | ``` 69 | make gobgp-calico-patch 70 | ``` 71 | 72 | --- 73 | 74 | Here's how the information from the diagram can be validated (using `worker2` as an example): 75 | 76 | 1. Pod IP and default route 77 | 78 | ```bash 79 | $ NODE=k8s-guide-worker2 make tshoot 80 | bash-5.0# ip -4 -br addr show dev eth0 81 | eth0@if2 UP 10.244.190.5/32 82 | 83 | bash-5.0# ip route 84 | default via 169.254.1.1 dev eth0 85 | 169.254.1.1 dev eth0 scope link 86 | ``` 87 | 88 | Note how the default route is pointing to the fake next-hop address `169.254.1.1`. This will be the same for all Pods and this IP will resolve to the same MAC address configured on all veth links: 89 | 90 | ``` 91 | bash-5.0# ip neigh 92 | 169.254.1.1 dev eth0 lladdr ee:ee:ee:ee:ee:ee REACHABLE 93 | ``` 94 | 95 | 2. Node's routing table 96 | 97 | ```bash 98 | $ docker exec k8s-guide-worker2 ip route 99 | default via 172.18.0.1 dev eth0 100 | 10.244.175.0/24 via 172.18.0.4 dev eth0 proto bird 101 | 10.244.190.0 dev calid7f7f4e15dd scope link 102 | blackhole 10.244.190.0/24 proto bird 103 | 10.244.190.1 dev calid599cd3d268 scope link 104 | 10.244.190.2 dev cali82aeec08a68 scope link 105 | 10.244.190.3 dev calid2e34ad38c6 scope link 106 | 10.244.190.4 dev cali4a822ce5458 scope link 107 | 10.244.190.5 dev cali0ad20b06c15 scope link 108 | 10.244.236.0/24 via 172.18.0.5 dev eth0 proto bird 109 | 172.18.0.0/16 dev eth0 proto kernel scope link src 172.18.0.3 110 | ``` 111 | 112 | A few interesting things to note in the above output: 113 | 114 | * The 2 x /24 routes programmed by `bird` are the PodCIDR ranges of the other two nodes. 115 | * The blackhole /24 route is the PodCIDR of the local node. 116 | * Inside the local PodCIDR there's a /32 host-route configured for each running Pod. 117 | 118 | 3. 
BGP RIB of the GoBGP route reflector 119 | 120 | ``` 121 | docker exec gobgp gobgp global rib 122 | 123 | Network Next Hop AS_PATH Age Attrs 124 | *> 10.244.175.0/24 172.18.0.4 00:05:04 [{Origin: i} {LocalPref: 100}] 125 | *> 10.244.190.0/24 172.18.0.3 00:05:04 [{Origin: i} {LocalPref: 100}] 126 | *> 10.244.236.0/24 172.18.0.5 00:05:03 [{Origin: i} {LocalPref: 100}] 127 | 128 | ``` 129 | 130 | ### A day in the life of a Packet 131 | 132 | Let's track what happens when Pod-1 (actual name is net-tshoot-rg2lp) tries to talk to Pod-3 (net-tshoot-6wszq). 133 | 134 | {{% notice note %}} 135 | We'll assume that the ARP and MAC tables are converged and fully populated. In order to do that issue a ping from Pod-1 to Pod-3's IP (10.244.236.0) 136 | {{% /notice %}} 137 | 138 | 0. Check the peer interface index of the veth link of Pod-1: 139 | 140 | ``` 141 | $ kubectl -n default exec net-tshoot-rg2lp -- ip -br addr show dev eth0 142 | 3: eth0@if14: mtu 1410 qdisc noqueue state UP mode DEFAULT group default 143 | link/ether b2:24:13:ec:77:42 brd ff:ff:ff:ff:ff:ff link-netnsid 0 144 | ``` 145 | 146 | This information (if14) will be used in step 2 to identify the node side of the veth link. 147 | 148 | 1. Pod-1 wants to send a packet to `10.244.236.0`. Its network stack performs a route lookup: 149 | 150 | ```bash 151 | $ kubectl -n default exec net-tshoot-rg2lp -- ip route get 10.244.236.0 152 | 10.244.236.0 via 169.254.1.1 dev eth0 src 10.244.175.4 uid 0 153 | cache 154 | ``` 155 | 156 | 2. The nexthop IP is `169.254.1.1` on `eth0`, ARP table lookup is needed to get the destination MAC: 157 | 158 | ```bash 159 | $ kubectl -n default exec net-tshoot-rg2lp -- ip neigh show 169.254.1.1 160 | 169.254.1.1 dev eth0 lladdr ee:ee:ee:ee:ee:ee STALE 161 | ``` 162 | 163 | As mentioned above, the node side of the veth link doesn't have any IP configured: 164 | 165 | ``` 166 | $ docker exec k8s-guide-worker ip addr show dev if14 167 | 14: calic8441ae7134@if3: mtu 1410 qdisc noqueue state UP group default 168 | link/ether ee:ee:ee:ee:ee:ee brd ff:ff:ff:ff:ff:ff link-netns cni-262ff521-1b00-b1c9-f0d5-0943a48a2ddc 169 | ``` 170 | 171 | So in order to respond to an ARP request for `169.254.1.1`, all veth links have proxy ARP enabled: 172 | ``` 173 | $ docker exec k8s-guide-worker cat /proc/sys/net/ipv4/conf/calic8441ae7134/proxy_arp 174 | 1 175 | ``` 176 | 177 | 3. The packet reaches the root namespace of the ingress node, where another L3 lookup takes place: 178 | 179 | ``` 180 | $ docker exec k8s-guide-worker ip route get 10.244.236.0 fibmatch 181 | 10.244.236.0/24 via 172.18.0.5 dev eth0 proto bird 182 | ``` 183 | 184 | 4. The packet is sent to the target node where another FIB lookup is performed: 185 | 186 | ``` 187 | $ docker exec k8s-guide-control-plane ip route get 10.244.236.0 fibmatch 188 | 10.244.236.0 dev cali0ec6986a945 scope link 189 | ``` 190 | 191 | The target IP is reachable over the `veth` link so ARP is used to determine the destination MAC address: 192 | 193 | ``` 194 | docker exec k8s-guide-control-plane ip neigh show 10.244.236.0 195 | 10.244.236.0 dev cali0ec6986a945 lladdr de:85:25:60:86:5b STALE 196 | ``` 197 | 198 | 5. Finally, the packet gets delivered to the `eth0` interface of the target pod: 199 | 200 | ``` 201 | kubectl exec net-tshoot-6wszq -- ip -br addr show dev eth0 202 | eth0@if2 UP 10.244.236.0/32 fe80::dc85:25ff:fe60:865b/64 203 | ``` 204 | 205 | ### SNAT functionality 206 | 207 | SNAT functionality for traffic egressing the cluster is done in two stages: 208 | 209 | 1. 
`cali-POSTROUTING` chain is inserted at the top of the POSTROUTING chain. 210 | 211 | 2. Inside that chain `cali-nat-outgoin` is SNAT'ing all egress traffic originating from `cali40masq-ipam-pools`. 212 | 213 | ``` 214 | iptables -t nat -vnL 215 | <...> 216 | Chain POSTROUTING (policy ACCEPT 5315 packets, 319K bytes) 217 | pkts bytes target prot opt in out source destination 218 | 7844 529K cali-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:O3lYWMrLQYEMJtB5 */ 219 | <...> 220 | Chain cali-POSTROUTING (1 references) 221 | pkts bytes target prot opt in out source destination 222 | 7844 529K cali-fip-snat all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:Z-c7XtVd2Bq7s_hA */ 223 | 7844 529K cali-nat-outgoing all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:nYKhEzDlr11Jccal */ 224 | <...> 225 | Chain cali-nat-outgoing (1 references) 226 | pkts bytes target prot opt in out source destination 227 | 1 84 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:flqWnvo8yq4ULQLa */ match-set cali40masq-ipam-pools src ! match-set cali40all-ipam-pools dst random-fully 228 | 229 | ``` 230 | 231 | Calico configures all IPAM pools as ipsets for a more efficient matching within iptables. These pools can be viewed on each individual node: 232 | 233 | ``` 234 | $ docker exec k8s-guide-control-plane ipset -L cali40masq-ipam-pools 235 | Name: cali40masq-ipam-pools 236 | Type: hash:net 237 | Revision: 6 238 | Header: family inet hashsize 1024 maxelem 1048576 239 | Size in memory: 512 240 | References: 1 241 | Number of entries: 1 242 | Members: 243 | 10.244.128.0/17 244 | ``` 245 | 246 | ### Caveats and Gotchas 247 | 248 | * Calico support GoBGP-based routing, but only as an [experimental feature](https://github.com/projectcalico/calico-bgp-daemon). 249 | * BGP configs are generated from templates based on the contents of the Calico [datastore](https://docs.projectcalico.org/getting-started/kubernetes/hardway/the-calico-datastore). This makes the customization of the generated BGP config very [problematic](https://github.com/projectcalico/calico/issues/1604). 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /content/cni/flannel.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "flannel" 3 | menuTitle: "flannel" 4 | date: 2020-09-13T17:33:04+01:00 5 | weight: 12 6 | --- 7 | 8 | [Flannel](https://github.com/coreos/flannel) is another example of a dual CNI plugin design: 9 | 10 | * **Connectivity** is taken care of by the `flannel` binary. This binary is a `metaplugin` -- a plugin that wraps other reference CNI plugins. In the [simplest case](https://github.com/containernetworking/plugins/tree/master/plugins/meta/flannel#operation), it generates a `bridge` plugin configuration and "delegates" the connectivity setup to it. 11 | 12 | * **Reachability** is taken care of by the Daemonset running `flanneld`. Here's an approximate sequence of actions of what happens when the daemon starts: 13 | 1. It queries the Kubernetes Node API to discover its local `PodCIDR` and `ClusterCIDR`. This information is saved in the `/run/flannel/subnet.env` and is used by the flannel metaplugin to generate the `host-local` IPAM configuration. 14 | 2. It creates a vxlan interfaces called `flannel.1` and updates the Kubernetes Node object with its MAC address (along with its own Node IP). 15 | 3. 
Using Kubernetes API, it discovers the VXLAN MAC information of other Nodes and builds a local unicast head-end replication (HER) table for its vxlan interface. 16 | 17 | {{% notice info %}} 18 | This plugin assumes that daemons have a way to exchange information (e.g. VXLAN MAC). Previously, this required a separate database (hosted etcd) which was considered a big disadvantage. The new version of the plugin uses Kubernetes API to store that information in annotations of a Node API object. 19 | {{% /notice %}} 20 | 21 | The fully converged IP and MAC tables will look like this: 22 | 23 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=jdjgs82ws8dfcGyB_vlg&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 24 | 25 | 26 | 27 | 28 | 29 | ### Lab 30 | 31 | Assuming that the lab is already [setup](/lab/), flannel can be enabled with the following 3 commands: 32 | 33 | ```bash 34 | make flannel 35 | ``` 36 | 37 | Check that the flannel daemonset has reached the `READY` state: 38 | 39 | ```bash 40 | $ kubectl -n kube-system get daemonset -l app=flannel 41 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE 42 | kube-flannel-ds 3 3 3 3 3 90s 43 | ``` 44 | 45 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin: 46 | 47 | ```bash 48 | make nuke-all-pods 49 | ``` 50 | 51 | Here's how the information from the diagram can be validated (using `worker2` as an example): 52 | 53 | 1. Pod IP and default route 54 | 55 | ```bash 56 | $ NODE=k8s-guide-worker2 make tshoot 57 | bash-5.0# ip route get 1.1 58 | 1.1.0.0 via 10.244.2.1 dev eth0 src 10.244.2.6 uid 0 59 | ``` 60 | 61 | 2. Node routing table 62 | 63 | ```bash 64 | $ docker exec -it k8s-guide-worker2 ip route 65 | default via 172.18.0.1 dev eth0 66 | 10.244.0.0/24 via 10.244.0.0 dev flannel.1 onlink 67 | 10.244.1.0/24 via 10.244.1.0 dev flannel.1 onlink 68 | 10.244.2.0/24 dev cni0 proto kernel scope link src 10.244.2.1 69 | 172.18.0.0/16 dev eth0 proto kernel scope link src 172.18.0.2 70 | ``` 71 | 72 | 3. Static ARP entries for NextHops 73 | 74 | ```bash 75 | $ docker exec -it k8s-guide-worker2 ip neigh | grep PERM 76 | 10.244.1.0 dev flannel.1 lladdr ce:0a:4f:22:a4:2a PERMANENT 77 | 10.244.0.0 dev flannel.1 lladdr 5a:11:99:ab:8c:22 PERMANENT 78 | 79 | ``` 80 | 81 | 4. VXLAN forwarding database 82 | 83 | ```bash 84 | $ docker exec -it k8s-guide-worker2 bridge fdb show dev flannel.1 85 | 5a:11:99:ab:8c:22 dst 172.18.0.3 self permanent 86 | ce:0a:4f:22:a4:2a dst 172.18.0.4 self permanent 87 | ``` 88 | 89 | ### A day in the life of a Packet 90 | 91 | Let's track what happens when Pod-1 tries to talk to Pod-3. 92 | 93 | {{% notice note %}} 94 | We'll assume that the ARP and MAC tables are converged and fully populated. 95 | {{% /notice %}} 96 | 97 | 1\. Pod-1 wants to send a packet to `10.244.0.2`. Its network stack looks up the routing table to find the NextHop IP: 98 | 99 | ```bash 100 | $ kubectl exec -it net-tshoot-4sg7g -- ip route get 10.244.0.2 101 | 10.244.0.2 via 10.244.1.1 dev eth0 src 10.244.1.6 uid 0 102 | ``` 103 | 104 | 2\. The packet reaches the `cbr0` bridge in the root network namespace, where the lookup is performed again: 105 | 106 | ```bash 107 | $ docker exec -it k8s-guide-worker ip route get 10.244.0.2 108 | 10.244.0.2 via 10.244.0.0 dev flannel.1 src 10.244.1.0 uid 0 109 | ``` 110 | 111 | 3\. 
The NextHop and the outgoing interfaces are set, the ARP table lookup returns the static entry provisioned by the `flanneld`: 112 | 113 | ```bash 114 | $ docker exec -it k8s-guide-worker ip neigh get 10.244.0.0 dev flannel.1 115 | 10.244.0.0 dev flannel.1 lladdr 5a:11:99:ab:8c:22 PERMANENT 116 | ``` 117 | 118 | 4\. Next, the FDB of the VXLAN interface is consulted to find out the destination VTEP IP: 119 | 120 | ```bash 121 | $ docker exec -it k8s-guide-worker bridge fdb | grep 5a:11:99:ab:8c:22 122 | 5a:11:99:ab:8c:22 dev flannel.1 dst 172.18.0.3 self permanent 123 | ``` 124 | 125 | 5\. The packet is VXLAN-encapsulated and sent to the `control-node` where `flannel.1` matches the VNI and the VXLAN MAC: 126 | 127 | ```bash 128 | $ docker exec -it k8s-guide-control-plane ip link show flannel.1 129 | 3: flannel.1: mtu 1450 qdisc noqueue state UNKNOWN mode DEFAULT group default 130 | link/ether 5a:11:99:ab:8c:22 brd ff:ff:ff:ff:ff:ff 131 | ``` 132 | 133 | 6\. The packet gets decapsulated and its original destination IP looked up in the main routing table: 134 | 135 | ```bash 136 | $ docker exec -it k8s-guide-control-plane ip route get 10.244.0.2 137 | 10.244.0.2 dev cni0 src 10.244.0.1 uid 0 138 | ``` 139 | 140 | 7\. The ARP and bridge tables are then consulted to find the outgoing veth interface: 141 | 142 | ```bash 143 | $ docker exec -it k8s-guide-control-plane ip neigh get 10.244.0.2 dev cni0 144 | 10.244.0.2 dev cni0 lladdr 7e:46:23:43:6f:ec REACHABLE 145 | $ docker exec -it k8s-guide-control-plane bridge fdb get 7e:46:23:43:6f:ec br cni0 146 | 7e:46:23:43:6f:ec dev vethaabf9eb2 master cni0 147 | ``` 148 | 149 | 8\. Finally, the packet arrives in the Pod-3's network namespace where it gets processed by the local network stack: 150 | 151 | ```bash 152 | $ kubectl exec -it net-tshoot-rkg46 -- ip route get 10.244.0.2 153 | local 10.244.0.2 dev lo src 10.244.0.2 uid 0 154 | ``` 155 | 156 | ### SNAT functionality 157 | 158 | Similar to [kindnet](/cni/kindnet/) `flanneld` sets up the SNAT rules to enable egress connectivity for the Pods, the only difference is that it does this directly inside the `POSTROUTING` chain: 159 | 160 | ```bash 161 | Chain POSTROUTING (policy ACCEPT 327 packets, 20536 bytes) 162 | pkts bytes target prot opt in out source destination 163 | 0 0 RETURN all -- * * 10.244.0.0/16 10.244.0.0/16 164 | 0 0 MASQUERADE all -- * * 10.244.0.0/16 !224.0.0.0/4 random-fully 165 | 0 0 RETURN all -- * * !10.244.0.0/16 10.244.0.0/24 166 | 0 0 MASQUERADE all -- * * !10.244.0.0/16 10.244.0.0/16 random-fully 167 | ``` 168 | 169 | ### Caveats and Gotchas 170 | 171 | * The official [installation manifest](https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml) does not install the CNI binary by default. This binary is distributed as a part of [reference CNI plugins](https://github.com/containernetworking/plugins/releases) and needs to be installed separately. 172 | * flannel can run in a `direct routing` mode, which acts by installing static routes for hosts on the same subnet. 173 | * flannel can use generic UDP encapsulation instead of VXLAN -------------------------------------------------------------------------------- /content/cni/iaas/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Public and Private Clouds 3 | menuTitle: "IaaS" 4 | weight: 13 5 | summary: Cloud-based Kubernetes deployments 6 | --- 7 | 8 | Kubernetes was designed to run inside a cloud environment. 
The idea is that the IaaS layer can provide resources that Kubernetes can consume without having to implement them internally. These resources include VMs (for Node management), L4 load-balancers (for service type LoadBalancer) and persistent storage (for PersistentVolumes). The reason why it's important for networking is that the underlying cloud SDN is also programmable and can be managed by the Kubernetes itself. 9 | 10 | {{% notice note %}} 11 | Although it is possible to run Kubernetes directly on baremetal, all of these problems will still need to be addressed individually by the cluster administrator. 12 | {{% /notice %}} 13 | 14 | ## Operation 15 | 16 | A typical managed Kubernetes deployment includes a simple CNI plugin called [kubenet](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/#kubenet) which is another example of a `metaplugin` -- it [re-uses](https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/dockershim/network/kubenet/kubenet_linux.go#L88) `bridge`, `host-local` and `loopback` reference CNI plugins and orchestrates them to provide **connectivity**. 17 | 18 | {{% notice note %}} 19 | It is enabled with a kubelet argument `--network-plugin=kubenet` which, for managed Kubernetes, means that it cannot be replaced with a different CNI plugin. 20 | {{% /notice %}} 21 | 22 | 23 | One notable difference with `kubenet` is that there is no daemon component in the plugin. In this case, **reachability** is provided by the underlying SDN and orchestrated by a [Cloud Controller Manager](https://kubernetes.io/docs/concepts/architecture/cloud-controller/). Behind the scenes, for each PodCIDR it installs a **static route** pointing to the Node IP -- this way traffic between Pods can just follow the default route in the root namespace, safely assuming that the underlying virtual router will know where to forward the packets. 24 | 25 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=A5cMEZUylDs-XIrDOgQv&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 26 | 27 | {{% notice note %}} 28 | If you're interested in using BGP to establish reachability in cloud environment, be sure to check out [cloudroutesync](https://github.com/networkop/cloudroutesync). 29 | {{% /notice %}} 30 | 31 | 32 | ### GKE 33 | 34 | Google's CCM uses [IP Alias ranges](https://cloud.google.com/vpc/docs/alias-ip) to provide reachability. 
The VPC subnet gets configured with a secondary address range that is the same as the cluster CIDR: 35 | 36 | ``` 37 | $ gcloud compute networks subnets describe private-subnet | grep -A 3 secondaryIpRanges 38 | secondaryIpRanges: 39 | - ipCidrRange: 10.244.0.0/22 40 | rangeName: private-secondary 41 | ``` 42 | 43 | Each new Node VM gets created with an alias range set to the Node's PodCIDR: 44 | 45 | ``` 46 | $ gcloud compute instances describe gke-node | grep -A 3 aliasIpRanges 47 | networkInterfaces: 48 | - aliasIpRanges: 49 | - ipCidrRange: 10.224.1.0/24 50 | subnetworkRangeName: private-secondary 51 | ``` 52 | 53 | Inside of the Node VM there's a standard set of interfaces: 54 | 55 | ``` 56 | $ ip -4 -br add show 57 | lo UNKNOWN 127.0.0.1/8 58 | eth0 UP 172.16.0.12/32 59 | cbr0 UP 10.224.1.1/24 60 | ``` 61 | 62 | The routing table only has a single non-directly connected default route: 63 | 64 | ``` 65 | $ ip route 66 | default via 172.16.0.1 dev eth0 proto dhcp metric 1024 67 | 172.16.0.12 dev eth0 proto dhcp scope link metric 1024 68 | 10.224.1.0/24 dev cbr0 proto kernel scope link src 10.224.1.1 69 | ``` 70 | 71 | {{% notice info %}} 72 | IP Alias is a special kind of static route that, amongst [other benefits](https://cloud.google.com/vpc/docs/alias-ip#key_benefits_of_alias_ip_ranges), gets checked for potential conflicts and automatically updates the corresponding anti-spoofing rules to allow VM to emit packets with non-native IPs. 73 | {{% /notice %}} 74 | 75 | ### AKS 76 | 77 | Azure uses normal static routes to setup reachability: 78 | 79 | ``` 80 | az network route-table show --ids "id" | grep -A 5 10.224.1.0 81 | "addressPrefix": "10.224.1.0/24", 82 | "etag": "W/\"tag\"", 83 | "id": "id", 84 | "name": "name", 85 | "nextHopIpAddress": "172.16.0.12", 86 | "nextHopType": "VirtualAppliance", 87 | ``` 88 | 89 | Inside of the Node VM there's a standard set of interfaces: 90 | 91 | 92 | ``` 93 | # ip -4 -br add show 94 | lo UNKNOWN 127.0.0.1/8 95 | eth0 UP 172.16.0.12/16 96 | cbr0 UP 10.224.1.1/24 97 | ``` 98 | 99 | And there is only a single non-directly connected route pointing out the primary interface: 100 | 101 | ``` 102 | # ip route 103 | default via 172.16.0.1 dev eth0 104 | 172.16.0.0/16 dev eth0 proto kernel scope link src 172.16.0.12 105 | 10.224.1.0/24 dev cbr0 proto kernel scope link src 10.224.1.1 106 | 168.63.129.16 via 172.16.0.1 dev eth0 107 | 169.254.169.254 via 172.16.0.1 dev eth0 108 | ``` 109 | 110 | {{% notice info %}} 111 | Azure Nodes can also be configured with ["Azure CNI"](https://docs.microsoft.com/en-us/azure/aks/configure-azure-cni) where Pod IPs get allocated from the same range as the underlying VNET. 112 | {{% /notice %}} 113 | 114 | 115 | ### EKS 116 | 117 | EKS takes a slightly different approach and runs as special [AWS CNI plugin](https://github.com/aws/amazon-vpc-cni-k8s) as a daemonset on all nodes. The functionality of this plugin is documented in the [proposal](https://github.com/aws/amazon-vpc-cni-k8s/blob/master/docs/cni-proposal.md) in a lot of detail. 118 | 119 | {{% notice info %}} 120 | The VPC-native routing is achieved by assigning each Node's ENI with secondary IPs, and adding more ENIs as the max number of IPs per ENI [limit](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#AvailableIpPerENI) is exceeded. 
121 | {{% /notice %}} 122 | 123 | 124 | One thing worth mentioning here is that in EKS's case, it's possible to replace the AWS CNI plugin with a number of [3rd party plugins](https://docs.aws.amazon.com/eks/latest/userguide/alternate-cni-plugins.html). In this case, VPC-native routing is not available since VPC virtual router won't be aware of the PodCIDRs and the only option is to run those plugins in the overlay mode -- by building a full-mesh of VXLAN tunnels and static routes that forward traffic over them. -------------------------------------------------------------------------------- /content/cni/kindnet.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "kindnet" 3 | menuTitle: "kindnet" 4 | weight: 11 5 | date: 2020-09-13T17:33:04+01:00 6 | --- 7 | 8 | 9 | Here is how [kindnet](https://github.com/aojea/kindnet#kindnet-components) satisfies the two main CNI plugin [requirements](/cni/): 10 | 11 | * **Reachability** is established by installing one static route per peer Node with NextHops pointing to the internal Node IPs. These routes get checked every 10 seconds to detect if there were any changes. 12 | * **Connectivity** is established by a mix of reference CNI plugins -- [`ptp`]((https://www.cni.dev/plugins/current/main/ptp/)) is used to create veth links, [`host-local`](https://www.cni.dev/plugins/current/ipam/host-local/) to allocate IPs and [`portmap`](https://www.cni.dev/plugins/current/meta/portmap/) to configure port mappings. The configuration file gets generated by each of the `kindnetd` daemons on startup. 13 | 14 | The diagram below shows how a fully converged routing table will look like: 15 | 16 | 17 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=a-ASnmM8o81X1hkJ6S8l&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 18 | 19 | ### Lab 20 | 21 | This plugin is built into the Lab cluster by default, so the only thing required is to bring up the [Lab environment](/lab/) 22 | 23 | ``` 24 | make setup && make up 25 | ``` 26 | 27 | Here's how to validate and verify the above diagram in the Lab environment, using the second Node as an example: 28 | 29 | 1. Pod IP and default route 30 | 31 | Pod IP should have a /24 subnet mask (same as `PodCIDR`) and the default route pointing to the first IP of that subnet. 32 | 33 | ``` 34 | $ NODE=k8s-guide-worker2 make tshoot 35 | bash-5.0# ip -br -4 add show eth0 36 | eth0@if5 UP 10.244.2.8/24 37 | bash-5.1# ip route 38 | default via 10.244.2.1 dev eth0 39 | 10.244.2.0/24 via 10.244.2.1 dev eth0 src 10.244.2.8 40 | 10.244.2.1 dev eth0 scope link src 10.244.2.8 41 | ``` 42 | 43 | {{% notice note %}} 44 | Note how the Pod routing is set up so that all the traffic, including the intra-subnet Pod-to-Pod communication, is sent over the same next-hop. This allows for all Pods to be interconnected via L3 without relying on a bridge or ARP for neighbor discovery. 45 | {{% /notice %}} 46 | 47 | 48 | 2. Node routing table 49 | 50 | It should contain one /32 host-route per local Pod and one /24 per peer node. 
51 | 52 | ``` 53 | docker exec -it k8s-guide-worker2 bash 54 | root@k8s-guide-worker2:/# ip route 55 | default via 172.18.0.1 dev eth0 56 | 10.244.0.0/24 via 172.18.0.10 dev eth0 57 | 10.244.1.0/24 via 172.18.0.11 dev eth0 58 | 10.244.2.2 dev vethf821f7f9 scope host 59 | 10.244.2.3 dev veth87514986 scope host 60 | 10.244.2.4 dev veth9829983c scope host 61 | 10.244.2.5 dev veth010c83ae scope host 62 | 10.244.2.8 dev vetha1079faf scope host 63 | ``` 64 | 65 | 3. PodCIDR gateway 66 | 67 | One notable thing is that the root namespace side of all veth links has the same IP address: 68 | 69 | ``` 70 | root@k8s-guide-worker2:/# ip -br -4 addr show | grep veth 71 | vethf821f7f9@if3 UP 10.244.2.1/32 72 | veth87514986@if3 UP 10.244.2.1/32 73 | veth9829983c@if3 UP 10.244.2.1/32 74 | veth010c83ae@if3 UP 10.244.2.1/32 75 | vetha1079faf@if3 UP 10.244.2.1/32 76 | ``` 77 | 78 | They each act as the default gateway for their peer Pods and don't have to be attached to a bridge. 79 | 80 | ### A day in the life of a Packet 81 | 82 | Let's track what happens when Pod-1 tries to talk to Pod-3. 83 | 84 | {{% notice note %}} 85 | We'll assume that the ARP and MAC tables are converged and fully populated. 86 | {{% /notice %}} 87 | 88 | 1. Pod-1 wants to send a packet to `10.244.0.5`. Its network stack looks up the routing table to find the NextHop IP: 89 | 90 | ``` 91 | $ kubectl exec -it net-tshoot-wxgcw -- ip route get 10.244.0.5 92 | 10.244.0.5 via 10.244.1.1 dev eth0 src 10.244.1.3 uid 0 93 | ``` 94 | 95 | 2. The packet is sent down the veth link and pops out in the root network namespace of the host, which repeats the lookup: 96 | 97 | ``` 98 | $ docker exec -it k8s-guide-worker ip route get 10.244.0.5 99 | 10.244.0.5 via 172.18.0.10 dev eth0 src 172.18.0.11 uid 0 100 | ``` 101 | 102 | 3. The packet gets L2-switches by the `kind` bridge and enters the control-plane's root network namespace: 103 | 104 | ``` 105 | docker exec -it k8s-guide-control-plane ip route get 10.244.0.5 106 | 10.244.0.5 dev veth9f517bf3 src 10.244.0.1 uid 0 107 | ``` 108 | 109 | 4. Finally, the packet arrives in the Pod-3's network namespace where it gets processed by the local network stack: 110 | 111 | ``` 112 | kubectl exec -it net-tshoot-x6wv9 -- ip route get 10.244.0.5 113 | local 10.244.0.5 dev lo src 10.244.0.5 uid 0 114 | ``` 115 | 116 | ### SNAT functionality 117 | 118 | In addition to the main CNI functionality, `kindnet` also sets up a number of IP masquerade (Source NAT) rules. These rules allow Pods to access the same networks as the hosting Node (e.g. Internet). 
The new `KIND-MASQ-AGENT` chain is inserted into the NAT's `POSTROUTING` chain and includes a special `RETURN` rule to exclude all traffic in the cluster-cidr range (10.244.0.0/16): 119 | 120 | ``` 121 | root@k8s-guide-worker2:/# iptables -t nat -nvL | grep -B 4 -A 4 KIND-MASQ 122 | Chain POSTROUTING (policy ACCEPT 3073 packets, 233K bytes) 123 | pkts bytes target prot opt in out source destination 124 | 61703 4686K KUBE-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */ 125 | 0 0 DOCKER_POSTROUTING all -- * * 0.0.0.0/0 172.18.0.1 126 | 54462 4060K KIND-MASQ-AGENT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type !LOCAL /* kind-masq-agent: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom KIND-MASQ-AGENT chain */ 127 | 128 | Chain KIND-MASQ-AGENT (1 references) 129 | pkts bytes target prot opt in out source destination 130 | 46558 3587K RETURN all -- * * 0.0.0.0/0 10.244.0.0/16 /* kind-masq-agent: local traffic is not subject to MASQUERADE */ 131 | 7904 473K MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* kind-masq-agent: outbound traffic is subject to MASQUERADE (must be last in chain) */ 132 | ``` 133 | 134 | ### Caveats and Gotchas 135 | 136 | * Assumes all Nodes are in the same L2 domain. 137 | * Relies on host-local, ptp, portmap and loopback [reference plugins](https://github.com/containernetworking/plugins#plugins-supplied). 138 | -------------------------------------------------------------------------------- /content/cni/weave.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "weave" 3 | menuTitle: "weave" 4 | date: 2020-10-17T12:33:04+01:00 5 | weight: 14 6 | --- 7 | 8 | [Weave Net](https://www.weave.works/docs/net/latest/overview/) is one of the "heavyweight" CNI plugins with a wide range of features and its own proprietary control plane to disseminate routing information between nodes. The scope of the plugin extends far beyond the base CNI functionality examined in this chapter and includes Network Policies, Encryption, Multicast and support for other container orchestration platforms (Swarm, Mesos). 9 | 10 | Following a similar pattern, let's examine how `weave` achieves the base CNI plugin functionality: 11 | 12 | * **Connectivity** is set up by the `weave-net` binary by attaching pods to the `weave` Linux bridge. The bridge is, in turn, attached to the Open vSwitch's kernel datapath which forwards the packets over the vxlan interface towards the target node. 13 | 14 | {{% notice info %}} 15 | Although it would have been possible to attach containers directly to the OVS datapath (ODP), Linux bridge plays the role of an egress router for all local pods so that ODP is only used for pod-to-pod forwarding. 16 | {{% /notice %}} 17 | 18 | * **Reachability** is established by two separate mechanisms: 19 | 20 | 1. [Weave Mesh](https://github.com/weaveworks/mesh) helps agents discover each other, check health, connectivity and exchange node-local details, e.g. IPs for VXLAN tunnel endpoint. 21 | 2. OVS datapath acts as a standard learning L2 switch with flood-and-learn behaviour being [programmed](https://github.com/weaveworks/go-odp) by the local agent (based on information distributed by the Mesh). All pods get their IPs from a single cluster-wide subnet and see their peers as if they were attached to a single broadcast domain. 
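Once the lab below is up, the state of the mesh can be checked directly from any of the `weave-net` pods. This is just a quick sketch -- pod names will differ in every cluster, and the `--local report` command used later in this chapter exposes the same information in much more detail:

```bash
# Pick the weave-net pod running on one of the nodes
WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net \
  --field-selector spec.nodeName=k8s-guide-worker2 \
  -o jsonpath='{.items[0].metadata.name}')
# Summarise the local router's view of the mesh (peers, connections, IPAM)
kubectl exec -n kube-system $WEAVEPOD -- /home/weave/weave --local status
```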
22 | 23 | 24 | {{% notice info %}} 25 | The cluster-wide CIDR range is still split into multiple non-overlapping ranges, which may look like a node-local pod CIDRs, however, all Pod IPs still have the same prefix length as the cluster CIDR, effectively making them part of the same L3 subnet. 26 | {{% /notice %}} 27 | 28 | 29 | The fully converged and populated IP and MAC tables will look like this: 30 | 31 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=GzriSjSBuyDBEbTt2saz&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 32 | 33 | 34 | 35 | 36 | ### Lab 37 | 38 | 39 | Assuming that the lab is already [set up](/lab/), weave can be enabled with the following commands: 40 | 41 | ```bash 42 | make weave 43 | ``` 44 | 45 | Check that the weave daemonset has reached the `READY` state: 46 | 47 | ```bash 48 | $ kubectl -n kube-system get daemonset -l name=weave-net 49 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE 50 | weave-net 3 3 3 3 3 30s 51 | ``` 52 | 53 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin: 54 | 55 | ```bash 56 | make nuke-all-pods 57 | ``` 58 | 59 | To make sure kube-proxy and weave set up the right set of NAT rules, existing NAT tables need to be flushed and repopulated: 60 | 61 | ``` 62 | make flush-nat && make weave-restart 63 | ``` 64 | 65 | --- 66 | 67 | Here's how the information from the diagram can be validated (using `worker2` as an example): 68 | 69 | 1. Pod IP and default route 70 | 71 | ```bash 72 | $ NODE=k8s-guide-worker2 make tshoot 73 | bash-5.0# ip route 74 | default via 10.44.0.0 dev eth0 75 | 10.32.0.0/12 dev eth0 proto kernel scope link src 10.44.0.7 76 | ``` 77 | 78 | 2. Node routing table 79 | 80 | ```bash 81 | $ docker exec -it k8s-guide-worker2 ip route 82 | default via 172.18.0.1 dev eth0 83 | 10.32.0.0/12 dev weave proto kernel scope link src 10.44.0.0 84 | 172.18.0.0/16 dev eth0 proto kernel scope link src 172.18.0.4 85 | ``` 86 | 87 | 3. ODP configuration and flows (output omitted for brevity) 88 | 89 | 90 | ``` 91 | WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net --field-selector spec.nodeName=k8s-guide-worker2 -o jsonpath='{.items[0].metadata.name}') 92 | kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report 93 | ``` 94 | 95 | ### A day in the life of a Packet 96 | 97 | Let's track what happens when Pod-1 (actual name is net-tshoot-22drp) tries to talk to Pod-3 (net-tshoot-pbp7z). 98 | 99 | {{% notice note %}} 100 | We'll assume that the ARP and MAC tables are converged and fully populated. In order to do that issue a ping command from Pod-1 to Pod-3's IP (10.40.0.1) 101 | {{% /notice %}} 102 | 103 | 104 | 1. Pod-1 wants to send a packet to `10.40.0.1`. Its network stack looks up the routing table: 105 | 106 | ```bash 107 | $ kubectl exec -it net-tshoot-22drp -- ip route get 10.40.0.1 108 | 10.40.0.1 dev eth0 src 10.32.0.4 uid 0 109 | cache 110 | ``` 111 | 112 | 2. Since the target IP is from a directly-connected network, the next step is to check its local ARP table: 113 | 114 | ```bash 115 | $ kubectl exec -it net-tshoot-22drp -- ip neigh show 10.40.0.1 116 | 10.40.0.1 dev eth0 lladdr d6:8d:31:c4:95:85 STALE 117 | ``` 118 | 119 | 3. 
The packet is sent out of the veth interface and hits the `weave` bridge in the root NS, where a L2 lookup is performed: 120 | 121 | ``` 122 | $ docker exec -it k8s-guide-worker bridge fdb get d6:8d:31:c4:95:85 br weave 123 | d6:8d:31:c4:95:85 dev vethwe-bridge master weave 124 | ``` 125 | 126 | 4. The packet is sent from the `weave` bridge down to the OVS kernel datapath over a veth link: 127 | 128 | ``` 129 | $ docker exec -it k8s-guide-worker ip link | grep vethwe- 130 | 12: vethwe-datapath@vethwe-bridge: mtu 1376 qdisc noqueue master datapath state UP mode DEFAULT group default 131 | 13: vethwe-bridge@vethwe-datapath: mtu 1376 qdisc noqueue master weave state UP mode DEFAULT group default 132 | ``` 133 | 134 | 5. The ODP does a flow lookup to determine what actions to apply to the packet (the output is redacted for brevity) 135 | 136 | ``` 137 | $ WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net --field-selector spec.nodeName=k8s-guide-worker -o jsonpath='{.items[0].metadata.name}') 138 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report 139 | <...> 140 | { 141 | "FlowKeys": [ 142 | "UnknownFlowKey{type: 22, key: 00000000, mask: 00000000}", 143 | "EthernetFlowKey{src: 0a:75:b7:d0:31:58, dst: d6:8d:31:c4:95:85}", 144 | "UnknownFlowKey{type: 25, key: 00000000000000000000000000000000, mask: 00000000000000000000000000000000}", 145 | "UnknownFlowKey{type: 23, key: 0000, mask: 0000}", 146 | "InPortFlowKey{vport: 1}", 147 | "UnknownFlowKey{type: 24, key: 00000000, mask: 00000000}" 148 | ], 149 | "Actions": [ 150 | "SetTunnelAction{id: 0000000000ade6da, ipv4src: 172.18.0.3, ipv4dst: 172.18.0.2, ttl: 64, df: true}", 151 | "OutputAction{vport: 2}" 152 | ], 153 | "Packets": 2, 154 | "Bytes": 84, 155 | "Used": 258933878 156 | }, 157 | <...> 158 | ``` 159 | 160 | 6. ODP encapsulates the original packet into a VXLAN frame ands sends the packet out of its local vxlan port: 161 | 162 | ``` 163 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report | jq '.Router.OverlayDiagnostics.fastdp.Vports[2]' 164 | { 165 | "ID": 2, 166 | "Name": "vxlan-6784", 167 | "TypeName": "vxlan" 168 | } 169 | ``` 170 | 171 | 7. The VXLAN frame gets L2-switched by the `kind` bridge and arrives at the `control-plane` node, where another ODP lookup is performed 172 | 173 | ``` 174 | $ WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net --field-selector spec.nodeName=k8s-guide-control-plane -o jsonpath='{.items[0].metadata.name}') 175 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report 176 | <...> 177 | { 178 | "FlowKeys": [ 179 | "UnknownFlowKey{type: 22, key: 00000000, mask: 00000000}", 180 | "UnknownFlowKey{type: 24, key: 00000000, mask: 00000000}", 181 | "UnknownFlowKey{type: 25, key: 00000000000000000000000000000000, mask: 00000000000000000000000000000000}", 182 | "TunnelFlowKey{id: 0000000000ade6da, ipv4src: 172.18.0.3, ipv4dst: 172.18.0.2}", 183 | "InPortFlowKey{vport: 2}", 184 | "UnknownFlowKey{type: 23, key: 0000, mask: 0000}", 185 | "EthernetFlowKey{src: 0a:75:b7:d0:31:58, dst: d6:8d:31:c4:95:85}" 186 | ], 187 | "Actions": [ 188 | "OutputAction{vport: 1}" 189 | ], 190 | "Packets": 3, 191 | "Bytes": 182, 192 | "Used": 259264545 193 | }, 194 | <...> 195 | ``` 196 | 197 | 8. 
The output port is the veth link connecting ODP to the `weave` bridge: 198 | 199 | ``` 200 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report | jq '.Router.OverlayDiagnostics.fastdp.Vports[1]' 201 | { 202 | "ID": 1, 203 | "Name": "vethwe-datapath", 204 | "TypeName": "netdev" 205 | } 206 | ``` 207 | 208 | 9. Following another L2 lookup in the `weave` bridge, the packet is sent down the veth link connected to the target Pod-3: 209 | 210 | ``` 211 | $ docker exec -it k8s-guide-control-plane bridge fdb get d6:8d:31:c4:95:85 br weave 212 | d6:8d:31:c4:95:85 dev vethwepl6be12f5 master weave 213 | ``` 214 | 215 | 10. Finally, the packet gets delivered to the `eth0` interface of the target pod: 216 | 217 | ``` 218 | $ kubectl exec -it net-tshoot-pbp7z -- ip link show dev eth0 219 | 16: eth0@if17: mtu 1376 qdisc noqueue state UP mode DEFAULT group default 220 | link/ether d6:8d:31:c4:95:85 brd ff:ff:ff:ff:ff:ff link-netnsid 0 221 | ``` 222 | 223 | ### SNAT functionality 224 | 225 | SNAT functionality for traffic egressing the cluster is done in two stages: 226 | 227 | 1. All packets that don't match the cluster CIDR range, get sent to the IP of the local `weave` bridge which sends them down the default route already configured in the root namespace. 228 | 229 | 2. A new `WEAVE` chain gets appended to the POSTROUTING chain which matches all packets from the cluster IP range `10.32.0.0/12` destined to all non-cluster IPs `!10.32.0.0/12` and translates all flows leaving the node (`MASQUERADE`): 230 | 231 | ``` 232 | iptables -t nat -vnL 233 | <...> 234 | Chain POSTROUTING (policy ACCEPT 6270 packets, 516K bytes) 235 | pkts bytes target prot opt in out source destination 236 | 51104 4185K WEAVE all -- * * 0.0.0.0/0 0.0.0.0/0 237 | <...> 238 | Chain WEAVE (1 references) 239 | pkts bytes target prot opt in out source destination 240 | 4 336 RETURN all -- * * 0.0.0.0/0 0.0.0.0/0 match-set weaver-no-masq-local dst /* Prevent SNAT to locally running containers */ 241 | 0 0 RETURN all -- * * 10.32.0.0/12 224.0.0.0/4 242 | 0 0 MASQUERADE all -- * * !10.32.0.0/12 10.32.0.0/12 243 | 2 120 MASQUERADE all -- * * 10.32.0.0/12 !10.32.0.0/12 244 | ``` 245 | 246 | 247 | ### Partial connectivity 248 | 249 | One of the interesting and unique features of Weave is its ability to function in environments with partial connectivity. This functionality is enabled by [Weave Mesh](https://github.com/weaveworks/mesh) and its use of the [gossip protocol](https://en.wikipedia.org/wiki/Gossip_protocol), allowing mesh members to dynamically discover each other and build the topology graph which is used to calculate the most optimal forwarding path. 250 | 251 | One way to demonstrate this is to break the connectivity between two worker nodes and verify that pods are still able to reach each other. Let's start by checking that ping works under normal conditions: 252 | 253 | ``` 254 | POD_WORKER2_IP=$(kubectl get pods -n default --field-selector spec.nodeName=k8s-guide-worker2 -o jsonpath='{.items[0].status.podIP}') 255 | POD_WORKER1_NAME=$(kubectl get pods -n default --field-selector spec.nodeName=k8s-guide-worker -o jsonpath='{.items[0].metadata.name}') 256 | kubectl -n default exec $POD_WORKER1_NAME -- ping -q -c 5 $POD_WORKER2_IP 257 | PING 10.40.0.7 (10.40.0.7) 56(84) bytes of data. 
258 | 259 | --- 10.40.0.7 ping statistics --- 260 | 5 packets transmitted, 5 received, 0% packet loss, time 4055ms 261 | rtt min/avg/max/mdev = 0.136/0.178/0.278/0.051 ms 262 | ``` 263 | 264 | Get the IPs of the two worker nodes: 265 | 266 | ``` 267 | IP_WORKER1=$(docker inspect k8s-guide-worker --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}') 268 | IP_WORKER2=$(docker inspect k8s-guide-worker2 --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}') 269 | ``` 270 | 271 | Add a new `DROP` rule for the traffic between these two IPs: 272 | 273 | ``` 274 | sudo iptables -I FORWARD -s $IP_WORKER1 -d $IP_WORKER2 -j DROP 275 | ``` 276 | 277 | A few seconds later, once the control plane has reconverged, repeat the ping test: 278 | 279 | ``` 280 | kubectl -n default exec $POD_WORKER1_NAME -- ping -q -c 5 $POD_WORKER2_IP 281 | 282 | PING 10.40.0.7 (10.40.0.7) 56(84) bytes of data. 283 | 284 | --- 10.40.0.7 ping statistics --- 285 | 5 packets transmitted, 5 received, 0% packet loss, time 4031ms 286 | rtt min/avg/max/mdev = 0.347/0.489/0.653/0.102 ms 287 | 288 | ``` 289 | 290 | The connectivity still works, although the traffic between the two worker nodes is definitely dropped: 291 | 292 | ``` 293 | sudo iptables -nvL FORWARD | grep DROP 294 | Chain FORWARD (policy DROP 0 packets, 0 bytes) 295 | 312 43361 DROP all -- * * 172.18.0.5 172.18.0.4 296 | ``` 297 | 298 | One thing worth noting here is that the average RTT has almost doubled compared to the original test. This is because the traffic is now relayed by the control-plane node - the only node that has full connectivity to both worker nodes. In the dataplane, this is achieved with a special UDP-based protocol called sleeve(https://www.weave.works/docs/net/latest/concepts/router-encapsulation/). 299 | 300 | 301 | The sending node (172.18.0.5) encapsulates ICMP packets for the other worker node (172.18.0.4) in a Sleeve payload and sends them to the control-plane node (172.18.0.2), which relays them on to the correct destination: 302 | 303 | 304 | ``` 305 | 12:28:54.056814 IP 172.18.0.5.48052 > 172.18.0.2.6784: UDP, length 106 306 | 12:28:54.057599 IP 172.18.0.2.48052 > 172.18.0.4.6784: UDP, length 106 307 | 12:28:54.057957 IP 172.18.0.4.48052 > 172.18.0.2.6784: UDP, length 106 308 | 12:28:54.058376 IP 172.18.0.2.48052 > 172.18.0.5.6784: UDP, length 106 309 | ``` 310 | 311 | Although it certainly comes with substantial performance trade-offs, this functionality can become very handy in environments with bad network links or where remote nodes are hosted in an isolated network environment with limited/restricted external connectivity. 312 | 313 | Don't forget to remove the drop rule at the end of the testing: 314 | 315 | ``` 316 | sudo iptables -D FORWARD -s $IP_WORKER1 -d $IP_WORKER2 -j DROP 317 | ``` 318 | 319 | 320 | ### Caveats and Gotchas 321 | 322 | * The official installation guide contains a number of [things to watch out for](https://www.weave.works/docs/net/latest/kubernetes/kube-addon/#-things-to-watch-out-for). 323 | * Addition/Deletion or intermittent connectivity to nodes [results](https://github.com/weaveworks/weave/issues/3645) in flow invalidation on all nodes, which, for a brief period of time, disrupts all connections until the flood-and-learn re-populates all forwarding tables. 
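When nodes get added or removed, or their connectivity flaps (see the second caveat above), it can be useful to check the state of the mesh connections and which overlay method each of them is currently using -- `fastdp` under normal conditions, or `sleeve` when fast datapath cannot be established. Below is a rough sketch of how this could be done, reusing the same `WEAVEPOD` lookup pattern as in the walkthrough above; the exact output format may differ between Weave Net versions:

```bash
# Pick the weave-net Pod running on a given node (same pattern as in the walkthrough)
WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net \
  --field-selector spec.nodeName=k8s-guide-worker \
  -o jsonpath='{.items[0].metadata.name}')

# 'status connections' lists every peer connection together with the
# encapsulation currently in use (fastdp or sleeve)
kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local status connections
```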
324 | 325 | 326 | 327 | ### Additional reading: 328 | 329 | [Weave's IPAM](https://www.weave.works/docs/net/latest/tasks/ipam/ipam/) 330 | [Overlay Method Selection](https://github.com/weaveworks/weave/blob/master/docs/fastdp.md) 331 | [OVS dataplane Implementation Details](http://www.openvswitch.org//support/ovscon2016/8/0935-pumputis.pdf) 332 | 333 | 334 | -------------------------------------------------------------------------------- /content/credits.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Credits 3 | disableToc: true 4 | --- 5 | 6 | ## Contributors 7 | 8 | [TKNG contributors](https://github.com/networkop/k8s-networking-guide/graphs/contributors) 9 | 10 | 11 | ## Tooling 12 | 13 | * [Netlify](https://www.netlify.com) - Continuous deployment and hosting of this documentation 14 | * [Hugo](https://gohugo.io/) - static blog generator 15 | * [Flux](https://github.com/fluxcd/flux) - the GitOps operator for Kubernetes 16 | * [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/) -- tool for running local Kubernetes clusters using Docker container “nodes” 17 | 18 | ## Special Thanks 19 | 20 | Special thanks to Roman Dodin [@ntdvps](https://twitter.com/ntdvps) for donating the domain [tkng.io](https://tkng.io) -------------------------------------------------------------------------------- /content/dns/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DNS" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 100 5 | summary: "The role and configuration of DNS" 6 | --- 7 | 8 | DNS plays a central role in Kubernetes service discovery. As it is mentioned in the [Services chapter](/services/), DNS is an essential part of how Services are consumed by end clients and, while the implementation is not baked into core Kubernetes controllers, the [DNS specification](https://github.com/kubernetes/dns/blob/master/docs/specification.md) is very explicit about the behaviour expected from such an implementation. The DNS spec defines the rules for the format of the queries and the expected responses. All Kubernetes Services have at least one corresponding A/AAAA DNS record in the format of `{service-name}.{namespace}.svc.{cluster-domain}` and the response format depends on the type of a Service: 9 | 10 | | Service Type | Response | 11 | |--------------|----------| 12 | | ClusterIP, NodePort, LoadBalancer | ClusterIP value | 13 | | Headless | List of Endpoint IPs | 14 | | ExternalName | CNAME pointing to the value of `spec.externalName` | 15 | 16 | {{% notice note %}} 17 | Some Services have additional SRV and PTR records and Pods also have a corresponding A/AAAA record; see the [official docs](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/) for more details. 18 | {{% /notice %}} 19 | 20 | Historically, there have been two implementations of this DNS spec -- one based on `dnsmasq` and another one based on `CoreDNS`; the latter became the [default option](https://kubernetes.io/blog/2018/07/10/coredns-ga-for-kubernetes-cluster-dns/) for kubeadm in Kubernetes 1.11. 21 | 22 | 23 | 24 | ## Service Discovery -- Server-side 25 | 26 | CoreDNS implements the Kubernetes DNS spec in a [dedicated plugin](https://coredns.io/plugins/kubernetes/) that gets compiled into a static binary and deployed in a Kubernetes cluster as a Deployment and exposed as a ClusterIP service.
This means that all communications with the DNS service inside a cluster are subject to the same network forwarding rules used and limitations experienced by normal Pods and set up by the [CNI](/cni/) and [Services](/services/) plugins. 27 | 28 | Since DNS speed and stability are considered [crucial](https://isitdns.com/) in any network-based communication, CoreDNS implementation is [highly optimised](https://github.com/coredns/deployment/blob/master/kubernetes/Scaling_CoreDNS.md) to minimise memory consumption and maximise query processing rate. In order to achieve that, CoreDNS stores only the [relevant parts](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/service.go#L33) of [Services](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/service.go#L12), [Pods](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/pod.go#L13) and [Endpoints](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/endpoint.go#L14) objects in its [local cache](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/controller.go#L115) that is optimised to return a response in a [single lookup](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/kubernetes.go#L495). 29 | 30 | By default, CoreDNS also acts as a DNS proxy for all external domains (e.g. example.com) using the [`forward` plugin](https://coredns.io/plugins/forward/) and is often deployed with the [`cache` plugin](https://coredns.io/plugins/cache/) enabled. The entire CoreDNS configuration can be found in the `coredns` ConfigMap: 31 | 32 | ```yaml 33 | apiVersion: v1 34 | kind: ConfigMap 35 | metadata: 36 | name: coredns 37 | namespace: kube-system 38 | data: 39 | Corefile: | 40 | .:53 { 41 | errors 42 | health { 43 | lameduck 5s 44 | } 45 | ready 46 | kubernetes cluster.local in-addr.arpa ip6.arpa { 47 | pods insecure 48 | fallthrough in-addr.arpa ip6.arpa 49 | ttl 30 50 | } 51 | prometheus :9153 52 | forward . /etc/resolv.conf { 53 | max_concurrent 1000 54 | } 55 | cache 30 56 | loop 57 | reload 58 | loadbalance 59 | } 60 | ``` 61 | 62 | ## Service Discovery -- Client-side 63 | 64 | DNS configuration inside a Pod is controlled by the `spec.dnsPolicy` and `spec.dnsConfig` [settings](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-s-dns-policy). By default, kubelet will configure the cluster DNS IP, stored in the configuration file and [hard-coded](https://github.com/kubernetes/kubernetes/blob/cde45fb161c5a4bfa7cfe45dfd814f6cc95433f7/cmd/kubeadm/app/constants/constants.go#L638) to the tenth IP of the ClusterIP range by the kubeadm. 65 | 66 | ``` 67 | root@k8s-guide-control-plane:/# cat /var/lib/kubelet/config.yaml 68 | apiVersion: kubelet.config.k8s.io/v1beta1 69 | ... 70 | clusterDNS: 71 | - 10.96.0.10 72 | ... 
73 | ``` 74 | With the above default settings, this is how a Pod deployed in the default namespace would see its own `resolv.conf` file: 75 | 76 | ``` 77 | $ cat /etc/resolv.conf 78 | search default.svc.cluster.local svc.cluster.local cluster.local 79 | nameserver 10.96.0.10 80 | options ndots:5 81 | ``` 82 | 83 | The search domains and `ndots` value are configured so that any non-FQDN DNS query made by a Pod is first tried in all of the specified domains, which allows the internal cluster DNS schema to take precedence over the external DNS ([explanation](https://github.com/kubernetes/kubernetes/issues/33554#issuecomment-266251056)). For example, any Pod in the `default` Namespace can look up the ClusterIP of the `kubernetes` Service in a single lookup (the shell is running a `stern -n kube-system -l k8s-app=kube-dns` in the background): 84 | 85 | ```bash 86 | $ kubectl -n default exec ds/net-tshoot -- dig kubernetes +search +short 87 | 10.96.0.1 88 | coredns-558bd4d5db-sqhkz coredns [INFO] 10.244.0.5:36255 - 36946 "A IN kubernetes.default.svc.cluster.local. udp 77 false 4096" NOERROR qr,aa,rd 106 0.0002139s 89 | ``` 90 | 91 | The downside of this behaviour is that any external domain lookup will require at least 4 separate queries: 92 | 93 | ``` 94 | $ kubectl -n default exec ds/net-tshoot -- dig tkng.io +search +short 95 | coredns-558bd4d5db-5jbgh coredns [INFO] 10.244.0.5:54816 - 13660 "A IN tkng.io.default.svc.cluster.local. udp 74 false 4096" NXDOMAIN qr,aa,rd 144 0.0002719s 96 | coredns-558bd4d5db-5jbgh coredns [INFO] 10.244.0.5:38006 - 38084 "A IN tkng.io.svc.cluster.local. udp 66 false 4096" NXDOMAIN qr,aa,rd 136 0.0001705s 97 | coredns-558bd4d5db-5jbgh coredns [INFO] 10.244.0.5:35302 - 4454 "A IN tkng.io.cluster.local. udp 62 false 4096" NXDOMAIN qr,aa,rd 132 0.0001219s 98 | 172.67.201.112 99 | 104.21.21.243 100 | coredns-558bd4d5db-sqhkz coredns [INFO] 10.244.0.5:47052 - 6189 "A IN tkng.io. udp 48 false 4096" NOERROR qr,rd,ad 71 0.0183829s 101 | ``` 102 | 103 | 104 | ## Optimisations 105 | 106 | DNS is widely regarded as the main [source](https://isitdns.com/) of all IT problems, and Kubernetes is no exception (see [1](https://github.com/kubernetes/kubernetes/issues/56903), [2](https://www.weave.works/blog/racy-conntrack-and-dns-lookup-timeouts), [3](https://github.com/kubernetes/kubernetes/issues/62628), [4](https://pracucci.com/kubernetes-dns-resolution-ndots-options-and-why-it-may-affect-application-performances.html)). Its way of deployment and reliance on [HPA](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) mean that some Nodes could become connection bottlenecks while the CPU and Memory of the DNS Pods may remain relatively low. There are a number of optimisations that can be enabled to improve DNS performance at the expense of additional resource utilisation and complexity: 107 | 108 | * The [**autopath** plugin](https://coredns.io/plugins/autopath/) can be enabled in CoreDNS to make it follow the chain of search paths on behalf of a client, thereby reducing the number of queries for an external domain required by the client from 4 (see above) to just one. 109 | * Each Kubernetes Node can run a [**NodeLocal DNSCache**](https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/) -- a daemonset of recursive DNS resolvers designed to reduce the load on a centralised CoreDNS deployment by serving as a caching layer between Pods and the DNS service. 110 | * Adjust lookups that are used frequently to add a trailing dot (for example, `tkng.io.`).
This is most effective for external hostnames: with the trailing dot the query is treated as absolute, avoiding the search path expansion through `cluster.local` and, potentially, other search domains in `/etc/resolv.conf`. 111 | 112 | 113 | ## External DNS 114 | 115 | The [DNS Specification](https://github.com/kubernetes/dns/blob/master/docs/specification.md) is only focused on the intra-cluster DNS resolution and service discovery. Anything to do with external DNS is left out of scope, despite the fact that most of the end-users are located outside of a cluster. For them, Kubernetes has to provide a way to discover external Kubernetes resources, LoadBalancer Services, Ingresses and Gateways, and there are two ways this can be accomplished: 116 | 117 | * An out-of-cluster DNS zone can be orchestrated by the [**ExternalDNS** cluster add-on](https://github.com/kubernetes-sigs/external-dns) -- a Kubernetes controller that synchronises external Kubernetes resources with any supported third-party DNS provider via an API (see the [GH page](https://github.com/kubernetes-sigs/external-dns#externaldns) for the list of supported providers). 118 | * An existing DNS zone can be configured to delegate a subdomain to a self-hosted external DNS plugin, e.g. [**k8s_gateway**](https://github.com/ori-edge/k8s_gateway). This approach assumes that this DNS plugin is deployed inside a cluster and exposed via a LoadBalancer IP, which is then used in an NS record for the delegated zone. All queries hitting this subdomain will get forwarded to this plugin which will respond as an authoritative nameserver for the delegated subdomain. 119 | 120 | -------------------------------------------------------------------------------- /content/ingress/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ingress & Egress" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 70 5 | summary: "North-South traffic forwarding" 6 | --- 7 | 8 | This chapter deals with anything related to North-South traffic forwarding in Kubernetes. First, let's make it clear that in both ingress (North) and egress (South) cases, traffic flows are actually bidirectional, i.e. a single flow would have packets flowing in both directions. The main distinction between ingress and egress is the direction of the original packet, i.e. where the client and server are located relative to the Kubernetes cluster boundary. 9 | 10 | These two types of traffic are treated very differently and almost always take asymmetric paths. This is because ingress is usually more important -- it's the revenue-generating user traffic for cluster applications, while egress is mainly non-revenue, Internet-bound traffic, e.g. DNS queries, package updates -- something that may not even be needed, depending on the application architecture. 11 | 12 | {{% notice note %}} 13 | Egress may have a slightly different meaning in the context of service meshes and multiple clusters, but this is outside of the scope of this chapter. 14 | {{% /notice %}} 15 | 16 | 17 | Because of the above differences, ingress and egress traffic needs to be examined separately and this part of the guide will be split into the following chapters: 18 | 19 | * [**Ingress API**](/ingress/ingress/) -- the original method of routing incoming traffic to different cluster applications. 20 | * [**Gateway API**](/ingress/gateway/) -- can be treated as the evolution of the Ingress API with the same goals and scope.
21 | * [**Egress**](/ingress/egress/) -- describes different options for egress traffic engineering. 22 | 23 | 24 | -------------------------------------------------------------------------------- /content/ingress/egress/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Egress" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 70 5 | summary: "Egress traffic engineering" 6 | --- 7 | 8 | Egress is a very loosely defined term in the Kubernetes ecosystem. Unlike its counterpart, egress traffic is not controlled by any standard Kubernetes API or a proxy. This is because most of the egress traffic is not revenue-generating and, in fact, can be completely optional. For situations when a Pod needs to communicate with an external service, it would make sense to do this via an API gateway rather than allow direct communication, and most of the service meshes provide this functionality, e.g. Consul's [Terminating Gateway](https://www.consul.io/docs/connect/gateways/terminating-gateway) or OSM's [Egress Policy API](https://docs.openservicemesh.io/docs/guides/traffic_management/egress/). However, we still need a way to allow for Pod-initiated external communication without a service mesh integration, and this is how it can be done: 9 | 10 | 1. By default, traffic leaving a Pod will follow the default route out of a Node and will get masqueraded (SNAT'ed) to the address of the outgoing interface. This is normally provisioned by a CNI plugin option, e.g. the `ipMasq` option of the [bridge plugin](https://www.cni.dev/plugins/current/main/bridge/#network-configuration-reference), or by a separate agent, e.g. [`ip-masq-agent`](https://github.com/kubernetes-sigs/ip-masq-agent). 11 | 2. For security reasons, some or all egress traffic can get redirected to an "egress gateway" deployed on a subset of Kubernetes Nodes. The operation, UX and redirection mechanism are implementation-specific and can work at an application level, e.g. Istio's [Egress Gateway](https://istio.io/latest/docs/tasks/traffic-management/egress/egress-gateway/), or at an IP level, e.g. Cilium's [Egress Gateway](https://docs.cilium.io/en/stable/gettingstarted/egress-gateway/). 12 | 13 | In both cases, the end result is that a packet leaves one of the Kubernetes Nodes, SNAT'ed to the address of the egress interface. The rest of the forwarding is done by the underlying network. 14 | 15 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=g6ESgU9g5ULUhjZ5bZWG&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 16 | 17 | 18 | ## Lab 19 | 20 | The way direct local egress works has already been described in the CNI part of this guide. Refer to the respective sections of the [kindnet](/cni/kindnet/#snat-functionality), [flannel](/cni/flannel/#snat-functionality), [weave](/cni/weave/#snat-functionality), [calico](/cni/calico/#snat-functionality) and [cilium](/cni/cilium/#snat-functionality) chapters for more details. 21 | 22 | For this lab exercise, we’ll focus on how Cilium implements the Egress Gateway functionality via a custom resource called `CiliumEgressNATPolicy`.
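To make the rest of this walkthrough easier to follow, here is a rough sketch of what such a policy could look like. This is an illustration rather than the exact manifest used by the lab -- the schema shown below is the `cilium.io/v2alpha1` version of the resource, and the selector, destination CIDR and egress IP are assumptions based on the values that appear later in this chapter:

```yaml
apiVersion: cilium.io/v2alpha1
kind: CiliumEgressNATPolicy
metadata:
  name: egress-sample
spec:
  egress:
  # Source Pods whose traffic should be NAT'ed -- everything in the
  # default namespace (assumed, to mirror the lab behaviour)
  - podSelector:
      matchLabels:
        io.kubernetes.pod.namespace: default
  # Only traffic towards these destinations is redirected to the egress node
  destinationCIDRs:
  - 172.18.0.0/16
  # Egress (source) IP used after translation -- an address owned by the
  # designated egress node, the control-plane node in this lab
  egressSourceIP: "172.18.0.3"
```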
23 | 24 | ### Preparation 25 | 26 | 27 | Assuming that the lab environment is already [set up](/lab/), Cilium can be enabled with the following command: 28 | 29 | ```bash 30 | make cilium 31 | ``` 32 | 33 | Wait for the Cilium daemonset to initialize: 34 | 35 | ```bash 36 | make cilium-wait 37 | ``` 38 | 39 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin: 40 | 41 | ```bash 42 | make nuke-all-pods 43 | ``` 44 | 45 | To make sure there's is no interference from `kube-proxy` we'll remove it completely along with any IPTables rules set up by it: 46 | 47 | ``` 48 | make nuke-kube-proxy 49 | ``` 50 | 51 | Deploy an "external" [echo server](https://github.com/mpolden/echoip) that will be used to check the source IP of the incoming request: 52 | 53 | ``` 54 | make egress-prep 55 | ``` 56 | 57 | By default, we should have a `net-tshoot` daemonset running on all Nodes: 58 | 59 | ``` 60 | $ kubectl -n default get pod -owide 61 | NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES 62 | net-tshoot-5ngbc 1/1 Running 0 4h53m 10.0.0.174 k8s-guide-control-plane 63 | net-tshoot-gcj27 1/1 Running 0 4h53m 10.0.2.86 k8s-guide-worker2 64 | net-tshoot-pcgf8 1/1 Running 0 4h53m 10.0.1.42 k8s-guide-worker 65 | ``` 66 | 67 | We can use these Pods to verify the (default) local egress behaviour by sending an HTTP GET to the echo server: 68 | 69 | ``` 70 | $ kubectl -n default get pod -l name=net-tshoot -o name | xargs -I{} kubectl -n default exec {} -- wget -q -O - echo 71 | 172.18.0.5 72 | 172.18.0.3 73 | 172.18.0.6 74 | ``` 75 | 76 | These are the same IPs that are assigned to our lab Nodes: 77 | 78 | ``` 79 | $ make node-ip-1 && make node-ip-2 && make node-ip-3 80 | control-plane:172.18.0.3 81 | worker:172.18.0.5 82 | worker2:172.18.0.6 83 | ``` 84 | 85 | Finally, we can enable the `CiliumEgressNATPolicy` that will NAT all traffic from Pods in the default namespace to the IP of the control-plane node: 86 | 87 | ``` 88 | make egress-setup 89 | ``` 90 | 91 | This can be verified by re-running the earlier command: 92 | 93 | ``` 94 | $ kubectl -n default get pod -l name=net-tshoot -o name | xargs -I{} kubectl -n default exec {} -- wget -q -O - echo 95 | 172.18.0.3 96 | 172.18.0.3 97 | 172.18.0.3 98 | ``` 99 | 100 | We can see that now all three requests appear to have come from the same Node. 101 | 102 | 103 | ### Walkthrough 104 | 105 | Now let's briefly walk through how Cilium implements the above NAT policy. The Cilium CNI chapter [explains](http://localhost:1313/cni/cilium/#2-nodes-ebpf-programs) how certain eBPF programs get attached to different interfaces. In our case, we're looking at a program attached to all `lxc` interfaces and processing incoming packets a Pod called [`from-container`](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_lxc.c#L970). Inside this program, a packet goes through several functions before it eventually gets to the `handle_ipv4_from_lxc` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_lxc.c#L510)) which does the bulk of work in IPv4 packet processing. The relevant part of this function is this one: 106 | 107 | ```c 108 | #ifdef ENABLE_EGRESS_GATEWAY 109 | { 110 | struct egress_info *info; 111 | struct endpoint_key key = {}; 112 | 113 | info = lookup_ip4_egress_endpoint(ip4->saddr, ip4->daddr); 114 | if (!info) 115 | goto skip_egress_gateway; 116 | 117 | /* Encap and redirect the packet to egress gateway node through a tunnel. 
118 | * Even if the tunnel endpoint is on the same host, follow the same data 119 | * path to be consistent. In future, it can be optimized by directly 120 | * direct to external interface. 121 | */ 122 | ret = encap_and_redirect_lxc(ctx, info->tunnel_endpoint, encrypt_key, 123 | &key, SECLABEL, monitor); 124 | if (ret == IPSEC_ENDPOINT) 125 | goto encrypt_to_stack; 126 | else 127 | return ret; 128 | } 129 | skip_egress_gateway: 130 | #endif 131 | ``` 132 | 133 | Here, our packet's source and destination IPs get passed to the `lookup_ip4_egress_endpoint` which performs a lookup in the following map: 134 | 135 | 136 | ``` 137 | $ NODE=k8s-guide-worker2 138 | $ cilium=$(kubectl get -l k8s-app=cilium pods -n cilium --field-selector spec.nodeName=$NODE -o jsonpath='{.items[0].metadata.name}') 139 | $ kubectl -n cilium exec -it $cilium -- bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_egress_v4 140 | key: 30 00 00 00 0a 00 00 1f ac 12 00 00 value: ac 12 00 03 ac 12 00 03 141 | key: 30 00 00 00 0a 00 01 d1 ac 12 00 00 value: ac 12 00 03 ac 12 00 03 142 | key: 30 00 00 00 0a 00 02 0e ac 12 00 00 value: ac 12 00 03 ac 12 00 03 143 | Found 3 elements 144 | ``` 145 | 146 | The above can be translated as the following: 147 | 148 | * Match all packets with source IP `10.0.0.174`, `10.0.2.86` or `10.0.1.42` (all Pods in the default namespace) and destination prefix of `172.18.0.0/16` 149 | * Return the value with egress IP of `172.18.0.3` and tunnel endpoint of `172.18.0.3`. 150 | 151 | The returned value is used in the `encap_and_redirect_lxc` function call that encapsulates the packet and forwards it to the Node with IP `172.18.0.3`. 152 | 153 | On the egress Node, our packet gets processed by the `from-overlay` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_overlay.c#L289)), and eventually falls through to the local network stack. The local network stack has the default route pointing out the `eth0` interface, which is where our packet gets forwarded next. 154 | 155 | At this point, Cilium applies its configured IP masquerade [policy](https://docs.cilium.io/en/v1.9/concepts/networking/masquerading/) using either IPTables or eBPF translation. The eBPF masquerading is implemented as a part of the `to-netdev` ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_host.c#L1010)) program attached to the egress direction of the `eth0` interface. 156 | 157 | ```c 158 | #if defined(ENABLE_NODEPORT) && \ 159 | (!defined(ENABLE_DSR) || \ 160 | (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ 161 | defined(ENABLE_MASQUERADE) || \ 162 | defined(ENABLE_EGRESS_GATEWAY)) 163 | if ((ctx->mark & MARK_MAGIC_SNAT_DONE) != MARK_MAGIC_SNAT_DONE) { 164 | ret = handle_nat_fwd(ctx); 165 | if (IS_ERR(ret)) 166 | return send_drop_notify_error(ctx, 0, ret, 167 | CTX_ACT_DROP, 168 | METRIC_EGRESS); 169 | } 170 | #endif 171 | ``` 172 | 173 | From `handle_nat_fwd` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/lib/nodeport.h#L2179)) the processing goes through `tail_handle_nat_fwd_ipv4`, `nodeport_nat_ipv4_fwd` and eventually gets to the `snat_v4_process` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/lib/nat.h#L504)) where all of the NAT translations take place. 
All new packets will fall through to the `snat_v4_new_mapping` function where a new random source port will be allocated to the packet: 174 | 175 | ```c 176 | #pragma unroll 177 | for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) { 178 | if (!snat_v4_lookup(&rtuple)) { 179 | ostate->common.created = bpf_mono_now(); 180 | rstate.common.created = ostate->common.created; 181 | 182 | ret = snat_v4_update(otuple, ostate, &rtuple, &rstate); 183 | if (!ret) 184 | break; 185 | } 186 | 187 | port = __snat_clamp_port_range(target->min_port, 188 | target->max_port, 189 | retries ? port + 1 : 190 | get_prandom_u32()); 191 | rtuple.dport = ostate->to_sport = bpf_htons(port); 192 | } 193 | ``` 194 | 195 | Finally, once the new source port has been selected and the connection tracking entry for subsequent packets set up, the packet gets its headers updated before being sent out of the egress interface: 196 | 197 | ```c 198 | return dir == NAT_DIR_EGRESS ? 199 | snat_v4_rewrite_egress(ctx, &tuple, state, off, ipv4_has_l4_header(ip4)) : 200 | snat_v4_rewrite_ingress(ctx, &tuple, state, off); 201 | ``` -------------------------------------------------------------------------------- /content/ingress/gateway/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Gateway API" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 20 5 | summary: "Evolution of Ingress API" 6 | --- 7 | 8 | The Ingress API has had a very difficult history and remained in `v1beta1` for many years. Despite having a thriving ecosystem of controller implementations, their use of the Ingress API has remained largely incompatible. In addition to that, the same controller vendors have started shipping their own sets of custom resources designed to address the limitations of the Ingress API. At some point, the Kubernetes SIG Network group even discussed the possibility of scrapping the Ingress API altogether and letting each vendor bring their own set of CRDs (see "Ingress Discussion Notes" in [Network SIG Meeting Minutes](https://docs.google.com/document/d/1_w77-zG_Xj0zYvEMfQZTQ-wPP4kXkpGD8smVtW_qqWM/edit)). Despite all that, the Ingress API has survived, addressed some of the more pressing issues and finally got promoted to `v1` in Kubernetes `v1.19`. However, some of the problems could not be solved by an incremental re-design and this is why the [Gateway API](https://gateway-api.sigs.k8s.io/) project (formerly called Service API) was founded. 9 | 10 | Gateway API decomposes a single Ingress API into a set of [independent resources](https://gateway-api.sigs.k8s.io/concepts/api-overview/) that can be combined via label selectors and references to build the desired proxy state. This decomposition follows a pattern very commonly found in proxy configuration -- listener, route and backends -- and can be viewed as a hierarchy of objects: 11 | 12 | |Hierarchy | Description | 13 | |--------------|---| 14 | | Gateway Class | Identifies a single GatewayAPI controller installed in a cluster. | 15 | | Gateway | Associates listeners with Routes, belongs to one of the Gateway classes. | 16 | | Route | Defines rules for traffic routing by linking Gateways with Services. | 17 | | Service | Represents a set of Endpoints to be used as backends.
| 18 | 19 | This is how the above hierarchy can be combined to expose an existing `web` Service to the outside world as `http://gateway.tkng.io` (see the Lab [walkthrough](http://localhost:1313/ingress/gateway/#walkthrough) for more details): 20 | 21 | ```yaml 22 | apiVersion: networking.x-k8s.io/v1alpha1 23 | kind: GatewayClass 24 | metadata: 25 | name: istio 26 | spec: 27 | controller: istio.io/gateway-controller 28 | --- 29 | apiVersion: networking.x-k8s.io/v1alpha1 30 | kind: Gateway 31 | metadata: 32 | name: gateway 33 | namespace: istio-system 34 | spec: 35 | gatewayClassName: istio 36 | listeners: 37 | - hostname: "*" 38 | port: 80 39 | protocol: HTTP 40 | routes: 41 | namespaces: 42 | from: All 43 | selector: 44 | matchLabels: 45 | selected: "yes" 46 | kind: HTTPRoute 47 | --- 48 | apiVersion: networking.x-k8s.io/v1alpha1 49 | kind: HTTPRoute 50 | metadata: 51 | name: http 52 | namespace: default 53 | labels: 54 | selected: "yes" 55 | spec: 56 | gateways: 57 | allow: All 58 | hostnames: ["gateway.tkng.io"] 59 | rules: 60 | - matches: 61 | - path: 62 | type: Prefix 63 | value: / 64 | forwardTo: 65 | - serviceName: web 66 | port: 80 67 | ``` 68 | 69 | Regardless of all the new features and operational benefits Gateway API brings, its final goal is exactly the same as for Ingress API -- to configure a proxy for external access to applications running in a cluster. 70 | 71 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=872_TPyC9xnwDXYNSrfC&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 72 | 73 | 74 | ## Lab 75 | 76 | For this lab exercise, we'll use one of the Gateway API implementations from [Istio](https://kubernetes.github.io/ingress-nginx/). 77 | 78 | 79 | ### Preparation 80 | 81 | 82 | Assuming that the lab environment is already [set up](/lab/), `Istio` can be set up with the following commands: 83 | 84 | 85 | ``` 86 | make gateway-setup 87 | ``` 88 | 89 | Wait for all Istio Pods to fully initialise: 90 | 91 | ``` 92 | $ make gateway-check 93 | pod/istio-ingressgateway-574dff7b88-9cd7v condition met 94 | pod/istiod-59db6b6d9-pl6np condition met 95 | pod/metallb-controller-748756655f-zqdxn condition met 96 | pod/metallb-speaker-97tb7 condition met 97 | pod/metallb-speaker-pwvrx condition met 98 | pod/metallb-speaker-qln9k condition met 99 | ``` 100 | 101 | Set up a test Deployment to be used in the walkthrough: 102 | 103 | 104 | ``` 105 | $ make deployment && make cluster-ip 106 | ``` 107 | 108 | Make sure that the Gateway has been assigned with a LoadBalancer IP: 109 | 110 | ``` 111 | $ kubectl get -n istio-system gateways gateway -o jsonpath='{.status.addresses}' | jq 112 | [ 113 | { 114 | "type": "IPAddress", 115 | "value": "198.51.100.0" 116 | } 117 | ] 118 | ``` 119 | 120 | Now we can verify the functionality: 121 | 122 | ``` 123 | $ docker exec k8s-guide-control-plane curl -s -HHost:gateway.tkng.io http://198.51.100.0/ | grep Welcome 124 | Welcome to nginx! 125 |

Welcome to nginx!
126 | ``` 127 | 128 | ### Walkthrough 129 | 130 | One of the easiest ways to very data plane configuration is to use the [istioctl](https://istio.io/latest/docs/setup/install/istioctl/) tool. The first thing we can do is look at the current state of all data plane proxies. In our case we're not using Istio's service mesh functionality, so the only proxy will be the `istio-ingressgateway`: 131 | 132 | ``` 133 | $ istioctl proxy-status 134 | NAME CDS LDS EDS RDS ISTIOD VERSION 135 | istio-ingressgateway-574dff7b88-tnqck.istio-system SYNCED SYNCED SYNCED SYNCED istiod-59db6b6d9-j8kt8 1.12-alpha.2a768472737998f0e13cfbfec74162005c53300c 136 | ``` 137 | 138 | Let's take a close look at the `proxy-config`, starting with the current set of listeners: 139 | 140 | ``` 141 | $ istioctl proxy-config listener istio-ingressgateway-574dff7b88-tnqck.istio-system 142 | ADDRESS PORT MATCH DESTINATION 143 | 0.0.0.0 8080 ALL Route: http.8080 144 | 0.0.0.0 15021 ALL Inline Route: /healthz/ready* 145 | 0.0.0.0 15090 ALL Inline Route: /stats/prometheus* 146 | ``` 147 | 148 | The one that we're interested in is called `http.8080` and here is how we can check all of the routing currently configured for it: 149 | 150 | ```json 151 | "istioctl proxy-config route istio-ingressgateway-574dff7b88-tnqck.istio-system --name http.8080 -ojson" 152 | [ 153 | { 154 | "name": "http.8080", 155 | "virtualHosts": [ 156 | { 157 | "name": "gateway.tkng.io:80", 158 | "domains": [ 159 | "gateway.tkng.io", 160 | "gateway.tkng.io:*" 161 | ], 162 | "routes": [ 163 | { 164 | "match": { 165 | "prefix": "/", 166 | "caseSensitive": true 167 | }, 168 | "route": { 169 | "cluster": "outbound|80||web.default.svc.cluster.local", 170 | "timeout": "0s", 171 | "retryPolicy": { 172 | "retryOn": "connect-failure,refused-stream,unavailable,cancelled,retriable-status-codes", 173 | "numRetries": 2, 174 | "retryHostPredicate": [ 175 | { 176 | "name": "envoy.retry_host_predicates.previous_hosts" 177 | } 178 | ], 179 | "hostSelectionRetryMaxAttempts": "5", 180 | "retriableStatusCodes": [ 181 | 503 182 | ] 183 | }, 184 | "maxGrpcTimeout": "0s" 185 | }, 186 | "metadata": { 187 | "filterMetadata": { 188 | "istio": { 189 | "config": "/apis/networking.istio.io/v1alpha3/namespaces/default/virtual-service/http-istio-autogenerated-k8s-gateway" 190 | } 191 | } 192 | }, 193 | "decorator": { 194 | "operation": "web.default.svc.cluster.local:80/*" 195 | } 196 | } 197 | ], 198 | "includeRequestAttemptCount": true 199 | } 200 | ], 201 | "validateClusters": false 202 | } 203 | ] 204 | ``` 205 | 206 | From the above output we can see that the proxy is set up to route all HTTP requests with `Host: gateway.tkng.io` header to a cluster called `outbound|80||web.default.svc.cluster.local`. 
Let's check this cluster's Endpoints: 207 | 208 | ``` 209 | $ istioctl proxy-config endpoints istio-ingressgateway-574dff7b88-tnqck.istio-system --cluster "outbound|80||web.default.svc.cluster.local" 210 | ENDPOINT STATUS OUTLIER CHECK CLUSTER 211 | 10.244.1.12:80 HEALTHY OK outbound|80||web.default.svc.cluster.local 212 | ``` 213 | 214 | The above Endpoint address corresponds to the only running Pod in the `web` deployment: 215 | 216 | ``` 217 | $ kubectl get pod -owide -l app=web 218 | NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES 219 | web-96d5df5c8-p8f97 1/1 Running 0 104m 10.244.1.12 k8s-guide-worker 220 | ``` -------------------------------------------------------------------------------- /content/ingress/ingress/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ingress API" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 10 5 | summary: "Ingress proxy routing" 6 | --- 7 | 8 | Although technically it is possible to expose internal applications via NodePort or LoadBalancer Services, this happens very rarely. There are two main reasons for that: 9 | 10 | * **Costs** -- since each LoadBalancer Service is associated with a single external address, this can translate into a sizeable fee when running in a public cloud environment. 11 | * **Functionality** -- simple L4 load balancing provided by Services lacks a lot of the features that are typically associated with an application proxy or gateway. This means that each exposed application will need to take care of things like TLS management, rate-limiting, authentication and intelligent traffic routing on its own. 12 | 13 | Ingress was designed as a generic, vendor-independent API to configure an HTTP load balancer that would be available to multiple Kubernetes applications. Running an Ingress would amortise the costs and efforts of implementing application gateway functionality and provide an easy-to-consume, native Kubernetes experience to cluster operators and users. At the very least, a user is expected to define a single rule telling the Ingress which backend Service to use. This results in all incoming HTTP requests being routed to one of the healthy Endpoints of this Service: 14 | 15 | ```yaml 16 | apiVersion: networking.k8s.io/v1 17 | kind: Ingress 18 | metadata: 19 | name: example 20 | spec: 21 | rules: 22 | - http: 23 | paths: 24 | - backend: 25 | service: 26 | name: web 27 | port: 28 | number: 80 29 | path: / 30 | ``` 31 | 32 | Similar to Service type [LoadBalancer](/services/loadbalancer/), Kubernetes only defines the Ingress API and leaves the implementation to cluster add-ons. In public cloud environments, these functions are implemented by existing application load balancers, e.g. [Application Gateway](https://azure.microsoft.com/en-us/services/application-gateway/) in AKS, [Application Load Balancer](https://docs.aws.amazon.com/eks/latest/userguide/alb-ingress.html) in EKS or [Google Front Ends (GFEs)](https://cloud.google.com/load-balancing/docs/https) for GKE. However, unlike a LoadBalancer controller, Kubernetes distributions do not limit the type of Ingress controller that can be deployed to perform these functions. There are over a dozen Ingress controller implementations from the major load balancer, proxy and service mesh vendors, which makes choosing the right Ingress controller a very daunting task.
Several attempts have been made to compile a decision matrix to help with this choice -- [one](https://docs.google.com/spreadsheets/d/1DnsHtdHbxjvHmxvlu7VhzWcWgLAn_Mc5L1WlhLDA__k/edit#gid=0) done by Flant and [one](https://docs.google.com/spreadsheets/d/191WWNpjJ2za6-nbG4ZoUMXMpUK8KlCIosvQB0f-oq3k/edit#gid=907731238) by learnk8s.io. Multiple Ingress controllers can be deployed in a single cluster and Ingress resources are associated with a particular controller based on the `.spec.ingressClassName` field. 33 | 34 | Ingress controller's implementation almost always includes the following two components: 35 | 36 | * **Controller** -- a process that communicates with the API server and collects all of the information required to successfully provision its proxies. 37 | * **Proxy** -- a data plane component, managed by the controller (via API, plugins or plain text files), can be scaled up and down by the Horizontal Pod Autoscaler. 38 | 39 | Typically, during the installation process, an Ingress Controller creates a Service type LoadBalancer and uses the allocated IP to update the `.status.loadBalancer` field of all managed Ingresses. 40 | 41 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=tSopwAg3hkGCBVX-7IBd&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 42 | 43 | 44 | ## Lab 45 | 46 | For this lab exercise, we'll use one of the most popular open-source Ingress controllers -- [ingress-nginx](https://kubernetes.github.io/ingress-nginx/). 47 | 48 | 49 | ### Preparation 50 | 51 | 52 | Assuming that the lab environment is already [set up](/lab/), `ingress-nginx` can be set up with the following commands: 53 | 54 | ``` 55 | make ingress-setup 56 | ``` 57 | 58 | Install a LoadBalancer controller to allocate external IP for the Ingress controller 59 | 60 | ``` 61 | make metallb 62 | ``` 63 | 64 | Wait for Ingress controller to fully initialise 65 | 66 | ``` 67 | make ingress-wait 68 | ``` 69 | 70 | Set up a couple of test Deployment and associated Ingress resources to be used in the walkthrough. 71 | 72 | ``` 73 | make ingress-prep 74 | ``` 75 | 76 | The above command sets up two ingress resources -- one doing the path-based routing and one doing the host-based routing. 
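For reference, the two Ingress resources created by this step could look roughly like the sketch below -- the names, paths, hostnames and backend Services are inferred from the outputs shown later in this section, so treat it as an approximation of the actual lab manifests rather than a verbatim copy:

```yaml
# tkng-1: path-based routing -- /prod and /dev map to different backend Services
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tkng-1
spec:
  ingressClassName: nginx
  rules:
  - http:
      paths:
      - path: /prod
        pathType: Prefix
        backend:
          service:
            name: prod
            port:
              number: 8080
      - path: /dev
        pathType: Prefix
        backend:
          service:
            name: dev
            port:
              number: 8080
---
# tkng-2: host-based routing -- the Host header selects the backend Service
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tkng-2
spec:
  ingressClassName: nginx
  rules:
  - host: prod
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prod
            port:
              number: 8080
  - host: dev
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: dev
            port:
              number: 8080
```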
Use the following command to confirm that both Ingresses have been set up and assigned with an external IP: 77 | 78 | ``` 79 | $ kubectl get ing 80 | NAME CLASS HOSTS ADDRESS PORTS AGE 81 | tkng-1 nginx * 198.51.100.0 80 46s 82 | tkng-2 nginx prod,dev 198.51.100.0 80 26s 83 | ``` 84 | 85 | 86 | Now we can verify the path-based routing functionality: 87 | 88 | ``` 89 | $ docker exec k8s-guide-control-plane curl -s http://198.51.100.0/dev 90 | Server address: 10.244.1.14:8080 91 | Server name: dev-694776949d-w2fw7 92 | Date: 29/Aug/2021:16:25:41 +0000 93 | URI: /dev 94 | Request ID: 6ccd350709dd92b76cdfabbcbf92d5c5 95 | 96 | $ docker exec k8s-guide-control-plane curl -s http://198.51.100.0/prod 97 | Server address: 10.244.1.13:8080 98 | Server name: prod-559ccb4b56-5krn6 99 | Date: 29/Aug/2021:16:25:50 +0000 100 | URI: /prod 101 | Request ID: 2fed2ada42daf911057c798e74504453 102 | ``` 103 | 104 | And the host-based routing: 105 | 106 | ``` 107 | $ docker exec k8s-guide-control-plane curl -s --resolve prod:80:198.51.100.0 http://prod 108 | Server address: 10.244.1.13:8080 109 | Server name: prod-559ccb4b56-5krn6 110 | Date: 29/Aug/2021:16:25:58 +0000 111 | URI: / 112 | Request ID: 8b28ba1ccab240700a6264024785356b 113 | 114 | $ docker exec k8s-guide-control-plane curl -s --resolve dev:80:198.51.100.0 http://dev 115 | Server address: 10.244.1.14:8080 116 | Server name: dev-694776949d-w2fw7 117 | Date: 29/Aug/2021:16:26:08 +0000 118 | URI: / 119 | Request ID: 5c8a8cfa037a2ece0c3cfe8fd2e1597d 120 | ``` 121 | 122 | To confirm that the HTTP routing is correct, take note of the `Server name` field of the response, which should match the name of the backend Pod: 123 | 124 | ``` 125 | $ kubectl get pod 126 | NAME READY STATUS RESTARTS AGE 127 | dev-694776949d-w2fw7 1/1 Running 0 10m 128 | prod-559ccb4b56-5krn6 1/1 Running 0 10m 129 | ``` 130 | 131 | ### Walkthrough 132 | 133 | Let's start by looking at the Ingress controller logs to see what happens when a new Ingress resource gets added to the API server: 134 | 135 | ``` 136 | $ kubectl logs deploy/ingress-controller-ingress-nginx-controller 137 | I0826 16:10:40.364640 8 main.go:101] "successfully validated configuration, accepting" ingress="tkng-1/default" 138 | I0826 16:10:40.371315 8 store.go:365] "Found valid IngressClass" ingress="default/tkng-1" ingressclass="nginx" 139 | I0826 16:10:40.371770 8 event.go:282] Event(v1.ObjectReference{Kind:"Ingress", Namespace:"default", Name:"tkng-1", UID:"8229d775-0a73-4484-91bf-fdb9053922b5", APIVersion:"networking.k8s.io/v1", ResourceVersion:"22155", FieldPath:""}): type: 'Normal' reason: 'Sync' Scheduled for sync 140 | I0826 16:10:40.372381 8 controller.go:150] "Configuration changes detected, backend reload required" 141 | ingress.networking.k8s.io/tkng-1 created 142 | I0826 16:10:40.467838 8 controller.go:167] "Backend successfully reloaded" 143 | I0826 16:10:40.468147 8 event.go:282] Event(v1.ObjectReference{Kind:"Pod", Namespace:"kube-system", Name:"ingress-controller-ingress-nginx-controller-84d5f6c695-pd54s", UID:"b6b63172-0240-41fb-a110-e18f475caddf", APIVersion:"v1", ResourceVersion:"14712", FieldPath:""}): type: 'Normal' reason: 'RELOAD' NGINX reload triggered due to a change in configuration 144 | I0826 16:11:29.812516 8 status.go:284] "updating Ingress status" namespace="default" ingress="tkng-1" currentValue=[] newValue=[{IP:198.51.100.0 Hostname: Ports:[]}] 145 | I0826 16:11:29.818436 8 event.go:282] Event(v1.ObjectReference{Kind:"Ingress", Namespace:"default", Name:"tkng-1", 
UID:"8229d775-0a73-4484-91bf-fdb9053922b5", APIVersion:"networking.k8s.io/v1", ResourceVersion:"22343", FieldPath:""}): type: 'Normal' reason: 'Sync' Scheduled for sync 146 | ``` 147 | 148 | Most of the above log is self-explanatory -- we see that the controller performs some initial validations, updates the configuration, triggers a proxy reload and updates the status field of the managed Ingress. We can see where the allocated IP is coming from by looking at the associated LoadBalancer service: 149 | 150 | ``` 151 | $ kubectl -n kube-system get svc -l app.kubernetes.io/name=ingress-nginx 152 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 153 | ingress-controller-ingress-nginx-controller LoadBalancer 10.96.193.245 198.51.100.0 80:30881/TCP,443:31634/TCP 36m 154 | ``` 155 | 156 | Now that we know what happens when a new Ingress is processed, let's take a look inside the Ingress controller pod 157 | 158 | 159 | ``` 160 | $ kubectl -n kube-system exec -it deploy/ingress-controller-ingress-nginx-controller -- pgrep -l nginx 161 | 8 /nginx-ingress-controller 162 | 31 nginx: master process /usr/local/nginx/sbin/nginx -c /etc/nginx/nginx.conf 163 | 579 nginx: worker process 164 | 580 nginx: worker process 165 | 581 nginx: worker process 166 | 582 nginx: worker process 167 | 583 nginx: worker process 168 | 584 nginx: worker process 169 | 585 nginx: worker process 170 | 586 nginx: worker process 171 | 587 nginx: cache manager process 172 | ``` 173 | 174 | Here we see to main components described above -- a controller called `nginx-ingress-controller` and a proxy process `/usr/local/nginx/sbin/nginx`. We also see that the proxy is started with the `-c` argument, pointing it at the configuration file. If we look inside this configuration file, we should see the host-based routing [`server_name`](https://nginx.org/en/docs/http/ngx_http_core_module.html#server_name) directives: 175 | ``` 176 | $ kubectl -n kube-system exec -it deploy/ingress-controller-ingress-nginx-controller -- cat /etc/nginx/nginx.conf | grep server_name 177 | server_names_hash_max_size 1024; 178 | server_names_hash_bucket_size 32; 179 | server_name_in_redirect off; 180 | server_name _ ; 181 | server_name dev ; 182 | server_name prod ; 183 | ``` 184 | 185 | Similarly, we can view the path-based routing [`location`](https://nginx.org/en/docs/http/ngx_http_core_module.html#location) directives: 186 | 187 | ``` 188 | kubectl exec -it deploy/ingress-controller-ingress-nginx-controller -- cat /etc/nginx/nginx.conf | grep "location /" 189 | location /prod/ { 190 | location /dev/ { 191 | location / { 192 | location /healthz { 193 | location /nginx_status { 194 | location / { 195 | location / { 196 | location / { 197 | location /healthz { 198 | location /is-dynamic-lb-initialized { 199 | location /nginx_status { 200 | location /configuration { 201 | location / { 202 | ``` 203 | 204 | Examining the plain `nginx.conf` configuration can be a bit difficult, especially for large configs. A simpler way of doing it is using an [ingress-nginx plugin](https://kubernetes.github.io/ingress-nginx/kubectl-plugin/) for kubectl which can be installed with [krew](https://krew.sigs.k8s.io/docs/user-guide/setup/install/). 
For example, this is how we could list all active Ingress resources managed by this controller: 205 | 206 | 207 | ``` 208 | $ kubectl ingress-nginx ingresses --all-namespaces 209 | NAMESPACE INGRESS NAME HOST+PATH ADDRESSES TLS SERVICE SERVICE PORT ENDPOINTS 210 | default tkng-1 /prod 198.51.100.0 NO prod 8080 1 211 | default tkng-1 /dev 198.51.100.0 NO dev 8080 1 212 | default tkng-2 prod/ 198.51.100.0 NO prod 8080 1 213 | default tkng-2 dev/ 198.51.100.0 NO dev 8080 1 214 | ``` 215 | 216 | Backend objects are [not managed](https://kubernetes.github.io/ingress-nginx/how-it-works/#avoiding-reloads-on-endpoints-changes) via a configuration file, so you won't see them in the `nginx.conf` rendered by the controller. The only way to view them is using the `ingress-nginx` plugin, e.g.: 217 | 218 | ``` 219 | $ kubectl ingress-nginx -n kube-system backends --deployment ingress-controller-ingress-nginx-controller | jq -r '.[] | "\(.name) => \(.endpoints)"' 220 | default-dev-8080 => [{"address":"10.244.1.16","port":"8080"}] 221 | default-prod-8080 => [{"address":"10.244.2.14","port":"8080"}] 222 | upstream-default-backend => [{"address":"127.0.0.1","port":"8181"}] 223 | ``` 224 | 225 | 226 | {{% notice warning %}} 227 | The above walkthrough is only applicable to the `nginx-ingress` controller. Other controllers may implement the same functionality differently, even if the data plane proxy is the same (e.g. nginx-ingress vs F5 nginx Ingress controller). Ingress API changes do not necessarily result in a complete proxy reload, assuming the underlying proxy supports hot restarts, e.g. [Envoy](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/operations/hot_restart). 228 | {{% /notice %}} -------------------------------------------------------------------------------- /content/lab/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lab Setup" 3 | menutitle: "Lab Setup" 4 | date: 2020-09-13T17:33:04+01:00 5 | summary: "Prerequisites and setup of the lab environment" 6 | --- 7 | 8 | {{% notice info %}} 9 | All labs are stored in a separate Github repository -- [k8s-guide-labs](https://github.com/networkop/k8s-guide-labs) 10 | {{% /notice %}} 11 | 12 | ## Prerequisites 13 | 14 | In order to interact with the lab, the following set of tools need to be pre-installed: 15 | 16 | * **Docker** with `containerd` runtime. This is what you get by default when you install [docker-ce](https://docs.docker.com/engine/install/). 17 | * **kubectl** to interact with a Kubernetes cluster. Installation instructions can be found [here](https://kubernetes.io/docs/tasks/tools/install-kubectl/). 18 | * **helm** to bootstrap the cluster with Flux. Installation instructions can be found [here](https://github.com/helm/helm#install) 19 | * **make** is used to automate and orchestrate manual tasks. Most instructions will be provided as a series of make commands. 20 | 21 | 22 | {{% notice info %}} 23 | A number of additional tools (e.g. kind) will be installed automatically during the Setup phase 24 | {{% /notice %}} 25 | 26 | Some **optional extras** that may make your life a lot easier: 27 | 28 | * [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#optional-kubectl-configurations) and [docker](https://github.com/docker/docker-ce/tree/master/components/cli/contrib/completion) commands auto-completion. 29 | * [kubens/kubectx](https://github.com/ahmetb/kubectx) to easily switch between namespaces and contexts. 
30 | * [stern](https://github.com/stern/stern) to read logs from multiple Pods at the same time. 31 | * [k9s](https://github.com/derailed/k9s) is a very convenient terminal dashboard for a Kubernetes cluster. 32 | 33 | Installation instructions will depend on the operating system. Here's an example [bash bootstrap script](https://gist.github.com/hellt/61242c680c78c3c813f20ecb9577a93e) for Ubuntu 20.04 LTS x86_64. 34 | 35 | ## Supported Operating Systems 36 | 37 | The main supported operating system is **Linux**. The kernel version can be anything that's `>=4.19`. 38 | 39 | {{% notice note %}} 40 | Most things should also work on Darwin. If you find a discrepancy and know how to fix it, please submit a PR. 41 | {{% /notice %}} 42 | 43 | 44 | ## Setup instructions 45 | 46 | Clone the k8s-guide-labs repository: 47 | 48 | ```bash 49 | git clone https://github.com/networkop/k8s-guide-labs.git && cd k8s-guide-labs 50 | ``` 51 | 52 | To view the list of available operations run: 53 | 54 | ```bash 55 | $ make 56 | 57 | check Check prerequisites 58 | setup Setup the lab environment 59 | up Bring up the cluster 60 | connect Connect to Weave Scope 61 | tshoot Connect to the troubleshooting pod 62 | reset Reset k8s cluster 63 | down Shutdown 64 | cleanup Destroy the lab environment 65 | ``` 66 | 67 | Check and install the required prerequisites: 68 | 69 | ```bash 70 | $ make check 71 | all good 72 | ``` 73 | 74 | Set up the lab environment with: 75 | 76 | ```bash 77 | make setup 78 | ``` 79 | 80 | Finally, bootstrap the cluster with Flux: 81 | 82 | 83 | ```bash 84 | make up 85 | ``` 86 | 87 | {{% notice tip %}} 88 | All labs are built in [GitOps](https://www.weave.works/technologies/gitops/) style using [Flux](https://github.com/fluxcd/flux) as the controller that manages the state of the cluster. 89 | {{% /notice %}} 90 | 91 | ## Interacting with the Lab 92 | 93 | The lab consists of a local Kubernetes cluster along with a caching pull-through Docker registry to speed up image download times. The cluster is built with [kind](https://github.com/kubernetes-sigs/kind) and the caching registry is a standalone container running alongside it. 94 | 95 | To build the cluster for the first time run: 96 | 97 | ``` 98 | make up 99 | ``` 100 | 101 | In order to stop the cluster (e.g. to free up resources) run: 102 | 103 | ``` 104 | make down 105 | ``` 106 | 107 | In order to rebuild the cluster (combined `down` and `up`) run: 108 | 109 | ``` 110 | make reset 111 | ``` 112 | 113 | To completely destroy the lab environment, including the caching registry, run: 114 | 115 | 116 | ``` 117 | make cleanup 118 | ``` 119 | 120 | 121 | ## Default applications 122 | 123 | The lab cluster is set up with a couple of applications that will be used throughout this guide: 124 | 125 | 1. **[Weave Scope](https://github.com/weaveworks/scope)** -- a tool to visualise and monitor Kubernetes cluster workloads. 126 | 127 | {{% notice tip %}} 128 | To connect to Weave Scope's front-end, run `make connect` and go to [http://localhost:8080](http://localhost:8080) 129 | {{% /notice %}} 130 | 131 | 132 | 2. **[netshoot](https://github.com/nicolaka/netshoot)** -- a Docker image pre-installed with a wide range of network troubleshooting tools, deployed as a DaemonSet. 133 | 134 | {{% notice tip %}} 135 | To connect to a Pod running on a particular Node (e.g. 
k8s-guide-worker), run `NODE=k8s-guide-worker make tshoot` 136 | {{% /notice %}} 137 | -------------------------------------------------------------------------------- /content/security/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Network Policies" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 70 5 | summary: "Network Policies & Access Control" 6 | --- 7 | 8 | # Under construction [help needed] -------------------------------------------------------------------------------- /content/services/Headless/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Headless" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 30 5 | draft: false 6 | --- 7 | 8 | This type of service does not perform any load-balancing and only implements DNS Service Discovery, based on the Kubernetes [DNS Spec](https://github.com/kubernetes/dns/blob/master/docs/specification.md#24---records-for-a-headless-service). Although this is the simplest and the most basic type of Service, its use is mainly limited to stateful applications like databases and other clustered workloads. In these use cases the assumption is that clients have some prior knowledge about the application they're going to be communicating with, e.g. number of nodes, naming structure, and can handle failover and load-balancing on their own. 9 | 10 | Some typical examples of stateful applications that use this kind of service are: 11 | 12 | * [zookeeper](https://github.com/bitnami/charts/blob/master/bitnami/zookeeper/templates/svc-headless.yaml) 13 | * [etcd](https://github.com/bitnami/charts/blob/master/bitnami/etcd/templates/svc-headless.yaml) 14 | * [consul](https://github.com/hashicorp/consul-helm/blob/master/templates/server-service.yaml) 15 | 16 | The only thing that makes a service "Headless" is the `clusterIP: None` which, on the one hand, tells dataplane agents to ignore this resource and, on the other hand, tells the DNS plugin that it needs a [special type of processing](https://github.com/coredns/coredns/blob/5b9b079dabc7f71463cea3f0c6a92f338935039d/plugin/kubernetes/kubernetes.go#L461). The rest of the API parameters look similar to any other Service: 17 | 18 | 19 | ```yaml 20 | apiVersion: v1 21 | kind: Service 22 | metadata: 23 | name: headless 24 | namespace: default 25 | spec: 26 | clusterIP: None 27 | ports: 28 | - name: http 29 | port: 8080 30 | selector: 31 | app: database 32 | ``` 33 | 34 | The corresponding Endpoints resources are still created for every healthy backend Pod, with the only notable distinctions being the absence of a hash in the Pod's name and the presence of the `hostname` field. 35 | 36 | ```yaml 37 | apiVersion: v1 38 | kind: Endpoints 39 | metadata: 40 | labels: 41 | service.kubernetes.io/headless: "" 42 | name: headless 43 | namespace: default 44 | subsets: 45 | - addresses: 46 | - hostname: database-0 47 | ip: 10.244.0.12 48 | nodeName: k8s-guide-control-plane 49 | targetRef: 50 | kind: Pod 51 | name: database-0 52 | namespace: default 53 | ports: 54 | - name: http 55 | port: 8080 56 | protocol: TCP 57 | ``` 58 | {{% notice info %}} 59 | 60 | In order to optimise the work of kube-proxy and other controllers that may need to read Endpoints, their controller marks all such objects with the `service.kubernetes.io/headless` label. 61 | {{% /notice %}} 62 | 63 | 64 | ## Implementation 65 | 66 | This type of service is implemented entirely within a DNS plugin. 
The following is a simplified version of the [actual code](https://github.com/coredns/coredns/blob/5b9b079dabc7f71463cea3f0c6a92f338935039d/plugin/kubernetes/kubernetes.go#L383) from CoreDNS's kubernetes plugin: 67 | 68 | {{< gist networkop cc2f49248321e6547d880ea1406704ea >}} 69 | 70 | 71 | CoreDNS builds an internal representation of Services, containing only the information that may be relevant to DNS (IPs, port numbers) and dropping all of the other details. This information is later used to build a DNS response. 72 | 73 | 74 | ### Lab 75 | 76 | Assuming that the lab is already [set up](/lab/), we can install a stateful application (consul) with the following command: 77 | 78 | ```bash 79 | make headless 80 | ``` 81 | 82 | Check that the consul StatefulSet has been deployed: 83 | 84 | ```bash 85 | $ kubectl get sts 86 | NAME READY AGE 87 | consul-server 3/3 25m 88 | ``` 89 | 90 | Now we should be able to see one Headless Service in the default namespace: 91 | 92 | ```bash 93 | $ kubectl get svc consul-server 94 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 95 | consul-server ClusterIP None <none> 8500/TCP,8301/TCP,8301/UDP,8302/TCP,8302/UDP,8300/TCP,8600/TCP,8600/UDP 29m 96 | ``` 97 | 98 | To interact with this service, we can do a DNS query from any of the `net-tshoot` Pods: 99 | 100 | ``` 101 | $ kubectl exec -it net-tshoot-8kqh6 -- dig consul-server +search 102 | 103 | ; <<>> DiG 9.16.11 <<>> consul-server +search 104 | ;; global options: +cmd 105 | ;; Got answer: 106 | ;; WARNING: .local is reserved for Multicast DNS 107 | ;; You are currently testing what happens when an mDNS query is leaked to DNS 108 | ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 2841 109 | ;; flags: qr aa rd; QUERY: 1, ANSWER: 3, AUTHORITY: 0, ADDITIONAL: 1 110 | ;; WARNING: recursion requested but not available 111 | 112 | ;; OPT PSEUDOSECTION: 113 | ; EDNS: version: 0, flags:; udp: 4096 114 | ; COOKIE: fe116ac7ab444725 (echoed) 115 | ;; QUESTION SECTION: 116 | ;consul-server.default.svc.cluster.local. IN A 117 | 118 | ;; ANSWER SECTION: 119 | consul-server.default.svc.cluster.local. 13 IN A 10.244.2.8 120 | consul-server.default.svc.cluster.local. 13 IN A 10.244.1.8 121 | consul-server.default.svc.cluster.local. 13 IN A 10.244.0.6 122 | 123 | ;; Query time: 0 msec 124 | ;; SERVER: 10.96.0.10#53(10.96.0.10) 125 | ;; WHEN: Sat Jun 05 15:30:09 UTC 2021 126 | ;; MSG SIZE rcvd: 245 127 | ``` 128 | 129 | Applications interacting with this StatefulSet can make use of DNS SRV lookups to find the individual hostnames and port numbers exposed by the backend Pods: 130 | 131 | ``` 132 | $ kubectl exec -it net-tshoot-8kqh6 -- dig consul-server +search srv +short 133 | 0 4 8301 consul-server-2.consul-server.default.svc.cluster.local. 134 | 0 4 8600 consul-server-2.consul-server.default.svc.cluster.local. 135 | 0 4 8300 consul-server-2.consul-server.default.svc.cluster.local. 136 | 0 4 8500 consul-server-2.consul-server.default.svc.cluster.local. 137 | 0 4 8302 consul-server-2.consul-server.default.svc.cluster.local. 138 | 0 4 8301 consul-server-1.consul-server.default.svc.cluster.local. 139 | 0 4 8600 consul-server-1.consul-server.default.svc.cluster.local. 140 | 0 4 8300 consul-server-1.consul-server.default.svc.cluster.local. 141 | 0 4 8500 consul-server-1.consul-server.default.svc.cluster.local. 142 | 0 4 8302 consul-server-1.consul-server.default.svc.cluster.local. 143 | 0 4 8301 consul-server-0.consul-server.default.svc.cluster.local. 144 | 0 4 8600 consul-server-0.consul-server.default.svc.cluster.local. 
145 | 0 4 8300 consul-server-0.consul-server.default.svc.cluster.local. 146 | 0 4 8500 consul-server-0.consul-server.default.svc.cluster.local. 147 | 0 4 8302 consul-server-0.consul-server.default.svc.cluster.local. 148 | ``` 149 | 150 | -------------------------------------------------------------------------------- /content/services/Optimisations/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Optimisations" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 99 5 | --- 6 | 7 | 8 | # Under construction [help needed] 9 | 10 | Endpoint Slices 11 | 12 | Topology Aware Hints 13 | 14 | SessionAffinity 15 | 16 | externalTrafficPolicy 17 | 18 | DSR (with Cilium) 19 | 20 | Maglev (with Cilium) 21 | 22 | Traffic policies 23 | 24 | [ServiceAppProtocol ](https://github.com/kubernetes/enhancements/blob/0e4d5df19d396511fe41ed0860b0ab9b96f46a2d/keps/sig-network/1507-app-protocol/README.md#risks-and-mitigations) -------------------------------------------------------------------------------- /content/services/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Services" 3 | date: 2020-09-13T17:33:04+01:00 4 | summary: "Cluster load-balancing solutions" 5 | weight: 20 6 | --- 7 | 8 | "Service" is one of the most powerful and, as a result, complex abstractions in Kubernetes. It is, also, a very heavily overloaded term which makes it even more confusing for people approaching Kubernetes for the first time. This chapter will provide a high-level overview of different types of Services, their goals and how they relate to other cluster elements and APIs. 9 | 10 | {{% notice info %}} 11 | A lot of ideas and concepts in this chapter are based on numerous talks and presentations on this topic. It's difficult to make concrete attributions, however most credit goes to members of [Network Special Interest Group](https://github.com/kubernetes/community/tree/master/sig-network). 12 | {{% /notice %}} 13 | 14 | ## Services Hierarchy 15 | 16 | A good starting point to understand a Kubernetes Service is to think of it as a distributed load-balancer. Similar to traditional load-balancers, its data model can be reduced to the following two components: 17 | 18 | 1. **Grouping of backend Pods** -- all Pods with similar labels represent a single service and can receive and process incoming traffic for that service. 19 | 2. **Methods of exposure** -- each group of Pods can be exposed either internally, to other Pods in a cluster, or externally, to end-users or external services in many different ways. 20 | 21 | All Services implement the above functionality, but each in its own way designed for its unique use case. In order to understand various Service types, it helps to view them as an "hierarchy" -- starting from the simplest, with each subsequent type building on top of the previous one. The table below is an attempt to explore and explain this hierarchy: 22 | 23 | | Type | Description | 24 | | ----------| ----------- | 25 | | **Headless** | The simplest form of load-balancing involving only DNS. Nothing is programmed in the data plane and no load-balancer VIP is assigned, however DNS query will return IPs for all backend Pods. The most typical use-case for this is stateful workloads (e.g. databases), where clients need stable and predictable DNS name and can handle the loss of connectivity and failover on their own. 
| 26 | | **ClusterIP** | The most common type, assigns a unique ClusterIP (VIP) to a set of matching backend Pods. DNS lookup of a Service name returns the allocated ClusterIP. All ClusterIPs are configured in the data plane of each node as DNAT rules -- destination ClusterIP is translated to one of the PodIPs. These NAT translations always happen on the egress (client-side) node which means that Node-to-Pod reachability must be provided externally (by a [CNI plugin](/cni)). | 27 | | **NodePort** | Builds on top of the ClusterIP Service by allocating a unique static port in the root network namespace of each Node and mapping it (via Port Translation) to the port exposed by the backend Pods. The incoming traffic can hit _any_ cluster Node and, as long as the destination port matches the NodePort, it will get forwarded to one of the healthy backend Pods. | 28 | | **LoadBalancer** | Attracts external user traffic to a Kubernetes cluster. Each LoadBalancer Service instance is assigned a unique, externally routable IP address which is advertised to the underlying physical network via BGP or gratuitous ARP. This Service type is implemented outside of the main kube controller -- either by the underlying cloud as an external L4 load-balancer or with a cluster add-on like [MetalLB](https://github.com/metallb/metallb), [Porter](https://github.com/kubesphere/porter) or [kube-vip](https://kube-vip.io/). | 29 | 30 | {{% notice note %}} 31 | One Service type that doesn't fit with the rest is `ExternalName`. It instructs DNS cluster add-on (e.g. CoreDNS) to respond with a CNAME, redirecting all queries for this service's domain name to an external FQDN, which can simplify interacting with external services (for more details see the [Design Spec](https://github.com/kubernetes/community/blob/b3349d5b1354df814b67bbdee6890477f3c250cb/contributors/design-proposals/network/service-external-name.md#motivation)). 32 | {{% /notice %}} 33 | 34 | The following diagram illustrates how different Service types can be combined to expose a stateful application: 35 | 36 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=xy2cxxoLWAjYxmtAeYh4&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 37 | 38 | 39 | {{% notice info %}} 40 | Although not directly connected, most Services rely on Deployments and StatefulSets to create the required number of Pods with a unique set of labels. 41 | {{% /notice %}} 42 | 43 | ## Service APIs and Implementation 44 | 45 | Services have a relatively small and simple API. At the very least they expect the following to be defined: 46 | 47 | * Explicit list of backend **ports** that needs to be exposed. 48 | * Label **selector** to understand which Pods are potential upstream candidates. 49 | * A Service **type** which defaults to `ClusterIP`. 50 | 51 | ```yaml 52 | kind: Service 53 | apiVersion: v1 54 | metadata: 55 | name: service-example 56 | spec: 57 | ports: 58 | - name: http 59 | port: 80 60 | targetPort: 80 61 | selector: 62 | app: nginx 63 | type: LoadBalancer 64 | ``` 65 | 66 | {{% notice note %}} 67 | Some services may not have any label selectors, in which case the list of backend Pods can still be constructed manually. This is often used to interconnect with services outside of the Kubernetes cluster while still relying on internal mechanisms of service discovery. 
68 | {{% /notice %}} 69 | 70 | Service's internal architecture consists of two loosely-coupled components: 71 | 72 | * Kubernetes **control plane** -- a process running inside the `kube-controller-manager` binary that reacts to API events and builds an internal representation of each service instance. This internal representation is a special **Endpoints** object that gets created for every Service instance and contains a list of healthy backend endpoints (PodIP + port). 73 | * Distributed **data plane** -- a set of Node-local agents that read **Endpoints** objects and program their local data plane. This is most commonly implemented with `kube-proxy`, with various competing implementations available from 3rd-party Kubernetes networking providers like Cilium, Calico, kube-router and others. 74 | 75 | Another less critical, but nonetheless important component is DNS. Internally, the DNS add-on is just a Pod running in a cluster that caches `Service` and `Endpoints` objects and responds to incoming queries according to the DNS-Based Service Discovery [specification](https://github.com/kubernetes/dns/blob/master/docs/specification.md), which defines the format for incoming queries and the expected structure for responses. 76 | 77 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=HR_OWBqgmX47NSTQvTWL&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 78 | 79 | {{% notice info %}} 80 | The following series of [short youtube videos](https://www.youtube.com/playlist?list=PLoWxE_5hnZUZMWrEON3wxMBoIZvweGeiq) provides a very good, albeit vendor-centric, overview of various types of Kubernetes Networking. 81 | {{% /notice %}} 82 | 83 | -------------------------------------------------------------------------------- /content/services/clusterIP/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ClusterIP" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 40 5 | --- 6 | 7 | When people say that Kubernetes networking is difficult, they very often refer to this type of service. One of the reasons for this perception is that all of its complexity is hidden behind a very minimalistic API. A common way of defining a Service only takes 5 lines of configuration (plus the standard metadata): 8 | 9 | ```yaml 10 | apiVersion: v1 11 | kind: Service 12 | metadata: 13 | name: clusterip-example 14 | spec: 15 | ports: 16 | - name: http 17 | port: 80 18 | selector: 19 | app: my-backend-app 20 | ``` 21 | 22 | Quite unexpectedly, these 5 lines can generate a large amount of state inside the cluster, as each Service has to be implemented on all Nodes and its state grows proportionally to the number of backend Endpoints (a quick way to see this for yourself is sketched at the end of this introduction). In order to better understand the networking behind it, the remainder of this chapter will be broken down into the following sections: 23 | 24 | - **Control Plane** will examine the mechanics of interaction between the user input, the API server processing it and a distributed set of node-local agents ultimately consuming it. 25 | - **Data Plane** will cover some of the standard implementations including iptables, ipvs and eBPF. 
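As a rough illustration of the earlier point about dataplane state, here is a quick check that can be run against the lab cluster (a sketch only -- it assumes kube-proxy is running in its default iptables mode and re-uses the `web` Deployment and kind node names from the labs):

```bash
# Count the NAT rules kube-proxy programmed for a single Service named "web"
# (kube-proxy tags its rules with a "namespace/name" comment):
docker exec k8s-guide-worker iptables -t nat -S | grep -c "default/web"

# Watch the total number of NAT rules grow as the number of Endpoints grows:
docker exec k8s-guide-worker iptables -t nat -S | wc -l
kubectl scale deploy web --replicas=10
docker exec k8s-guide-worker iptables -t nat -S | wc -l
```
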
26 | -------------------------------------------------------------------------------- /content/services/clusterIP/control-plane.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Control Plane" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 10 5 | --- 6 | 7 | Let's start our exploration with the first step of any Kubernetes cluster's lifecycle -- bootstrapping. At this stage, a cluster admin is expected to provide a number of parameters one of which will be called `service-cidr` (or something similar depending on the orchestrator) which gets mapped to a `service-cluster-ip-range` argument of the `kube-apiserver`. 8 | 9 | {{% notice note %}} 10 | For the sake of simplicity we'll assume `kubeadm` is used to orchestrate a cluster. 11 | {{% /notice %}} 12 | 13 | An Orchestrator will suggest a default value for this range (e.g. `10.96.0.0/12`) which most of the times is safe to use. As we'll see later, this range is completely "virtual", i.e. does not need to have any coordination with the underlying network and can be re-used between clusters (one notable exception being [this Calico feature](https://docs.projectcalico.org/networking/advertise-service-ips#advertise-service-cluster-ip-addresses)). The only constraints for this value are: 14 | 15 | - It must not overlap with any of the Pod IP ranges or Node IPs of the same cluster. 16 | - It must not be loopback (127.0.0.0/8 for IPv4, ::1/128 for IPv6) or link-local (169.254.0.0/16 and 224.0.0.0/24 for IPv4, fe80::/64 for IPv6). 17 | 18 | Once a Kubernetes cluster has been bootstrapped, every new `ClusterIP` service type will get a unique IP allocated from this range, for example: 19 | 20 | 21 | ```yaml 22 | $ kubectl create svc clusterip test --tcp=80 && kubectl get svc test 23 | service/test created 24 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 25 | test ClusterIP 10.96.37.70 80/TCP 0s 26 | ``` 27 | 28 | {{% notice info %}} 29 | The first IP from the Service CIDR range is reserved and always assigned to a special `kubernetes` service. See [this explanation](https://networkop.co.uk/post/2020-06-kubernetes-default/) for more details. 30 | {{% /notice %}} 31 | 32 | 33 | Inside the [`kube-controller-manager`](https://kubernetes.io/docs/reference/command-line-tools-reference/kube-controller-manager/)'s [reconciliation loop](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L378), it builds an internal representation for each Service which includes a list of all associated Endpoints. From then on, both `Service` and `Endpoints` resources co-exist, with the former being the user-facing, aggregated view of a load-balancer and the latter being the detailed, low-level set of IP and port details that will be programmed in the dataplane. There are two ways to compile a list of Endpoints: 34 | 35 | 36 | - **Label selectors** is the most common approach, relies on labels to [identify](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L414) all matching Pods, and collect their [IP](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L259) and [port](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L479) information. 
37 | - **Manual configuration** relies on users to assemble their own set of Endpoints; this approach is very rarely used but can give an intra-cluster address and hostname to any external service. 38 | 39 | All Endpoints are stored in an `Endpoints` resource that bears the same name as its parent Service. Below is an example of how it might look for the `kubernetes` service: 40 | 41 | ```yaml 42 | apiVersion: v1 43 | kind: Endpoints 44 | metadata: 45 | labels: 46 | endpointslice.kubernetes.io/skip-mirror: "true" 47 | name: kubernetes 48 | namespace: default 49 | subsets: 50 | - addresses: 51 | - ip: 172.18.0.4 52 | ports: 53 | - name: https 54 | port: 6443 55 | protocol: TCP 56 | ``` 57 | 58 | {{% notice info %}} 59 | Under the hood Endpoints are implemented as a set of slices; this will be covered in the Optimisations section. 60 | {{% /notice %}} 61 | 62 | It is worth noting that the [DNS Spec](https://github.com/kubernetes/dns/blob/master/docs/specification.md#23---records-for-a-service-with-clusterip), mentioned briefly in the previous chapter, also defines the behaviour for `ClusterIP`-type services. Specifically, the following 3 query types must be supported: 63 | 64 | * **A/AAAA** Records -- will return a single ClusterIP for any query matching the Service Name (`metadata.name`) in the same namespace or `<service>.<namespace>.svc.<zone>` in a different namespace. 65 | * **SRV** Record -- will return an SRV record for each unique port + protocol combination. 66 | * **PTR** Record -- can be used to look up a service name based on a provided `ClusterIP`. 67 | 68 | 69 | --- 70 | 71 | The Kubernetes `kube-controller-manager` is constantly collecting, processing and updating all Endpoints and Service resources, however nothing is being done with this information yet. The ultimate consumers of this information are a set of node-local agents (controllers) that will use it to program their local dataplane. Most of these node-local agents are using 72 | the [client-go](https://github.com/kubernetes/sample-controller/blob/master/docs/controller-client-go.md) library to synchronize and process updates coming from the API server, which means they will all share the following behaviour: 73 | 74 | * Each node-local agent maintains a local cache of all interesting objects, which gets sync'ed in the beginning (via a `List` operation) and observed for the remainder of their lifecycle (via a `Watch` operation). 75 | * The [architecture](https://github.com/kubernetes/sample-controller/blob/master/docs/controller-client-go.md) with two queues and a local cache ensures that controllers can absorb multiple frequent changes of the same object, thereby minimising the churn in the dataplane. 76 | 77 | -------------------------------------------------------------------------------- /content/services/clusterIP/dataplane/IPVS.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "IPVS" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 20 5 | --- 6 | 7 | IPTables was the first implementation of kube-proxy's dataplane; however, over time its limitations have become more pronounced, especially when operating at scale. There are several side-effects of implementing a proxy with something that was designed to be a firewall, the main one being a limited set of data structures. The way it manifests itself is that every ClusterIP Service needs to have a unique entry, and these entries can't be grouped -- they have to be processed sequentially as long chains of rules. 
This means that any dataplane lookup or a create/update/delete operation needs to traverse the chain until a match is found which, at a large-enough scale can result in [minutes](https://docs.google.com/presentation/d/1BaIAywY2qqeHtyGZtlyAp89JIZs59MZLKcFLxKE6LyM/edit#slide=id.p20) of added processing time. 8 | 9 | {{% notice note %}} 10 | Detailed performance analysis and measurement results of running iptables at scale can be found in the [Additional Reading](#additional-reading) section at the bottom of the page. 11 | {{% /notice %}} 12 | 13 | All this led to `ipvs` being added as an [enhancement proposal](https://github.com/kubernetes/enhancements/tree/0e4d5df19d396511fe41ed0860b0ab9b96f46a2d/keps/sig-network/265-ipvs-based-load-balancing) and eventually graduating to GA in Kubernetes version 1.11. The new dataplane implementation offers a number of improvements over the existing `iptables` mode: 14 | 15 | * All Service load-balancing is migrated to IPVS which can perform in-kernel lookups and masquerading in constant time, regardless of the number of configured Services or Endpoints. 16 | 17 | * The remaining rules in IPTables have been re-engineered to make use of [ipset](https://wiki.archlinux.org/title/Ipset), making the lookups more efficient. 18 | 19 | * Multiple additional load-balancer [scheduling modes](https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/#parameter-changes) are now available, with the default one being a simple round-robin. 20 | 21 | 22 | On the surface, this makes the decision to use `ipvs` an obvious one, however, since `iptables` have been the default mode for so long, some of its quirks and undocumented side-effects have become the standard. One of the fortunate side-effects of the `iptables` mode is that `ClusterIP` is never bound to any kernel interface and remains completely virtual (as a NAT rule). So when `ipvs` changed this behaviour by introducing a dummy `kube-ipvs0` interface, it [made it possible](https://github.com/kubernetes/kubernetes/issues/72236) for processes inside Pods to access any host-local services bound to `0.0.0.0` by targeting any existing `ClusterIP`. Although this does make `ipvs` less safe by default, it doesn't mean that these risks can't be mitigated (e.g. by not binding to `0.0.0.0`). 23 | 24 | The diagram below is a high-level and simplified view of two distinct datapaths for the same `ClusterIP` virtual service -- one from a remote Pod and one from a host-local interface. 25 | 26 | 27 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=BucKDkpFbDgBnzcmmJd5&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 28 | 29 | 30 | 31 | ### Lab Setup 32 | 33 | Assuming that the lab environment is already [set up](/lab/), ipvs can be enabled with the following command: 34 | 35 | ```bash 36 | make ipvs 37 | ``` 38 | 39 | Under the covers, the above command updates the proxier mode in kube-proxy's ConfigMap so in order for this change to get picked up, we need to restart all of the agents and flush out any existing iptable rules: 40 | 41 | ```bash 42 | make flush-nat 43 | ``` 44 | 45 | Check the logs to make sure kube-proxy has loaded all of the [required kernel modules](https://github.com/kubernetes/kubernetes/blob/2f753ec4c826895e4ccd3d6bdda2b1ab777ceeb8/pkg/util/ipvs/ipvs.go#L130). 
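Since the kind nodes share the host's kernel, the presence of these modules can also be verified directly on the host (a quick sketch, assuming a Linux host with the standard `kmod` tools installed):

```bash
# List the IPVS-related kernel modules kube-proxy expects to find
lsmod | grep -E '^ip_vs'

# If any of them are missing, they can usually be loaded manually
sudo modprobe -a ip_vs ip_vs_rr ip_vs_wrr ip_vs_sh
```
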
In case of a failure, the following error will be present in the logs and kube-proxy will fall back to the `iptables` mode: 46 | 47 | 48 | ```bash 49 | $ make kube-proxy-logs | grep -i ipvs 50 | E0626 17:19:43.491383 1 server_others.go:127] Can't use the IPVS proxier: IPVS proxier will not be used because the following required kernel modules are not loaded: [ip_vs ip_vs_rr ip_vs_wrr ip_vs_sh] 51 | ``` 52 | 53 | Another way to confirm that the change has succeeded is to check that Nodes now have a new dummy ipvs device: 54 | 55 | {{< highlight bash "linenos=false,hl_lines=2" >}} 56 | $ docker exec -it k8s-guide-worker ip link show kube-ipvs0 57 | 7: kube-ipvs0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group default 58 | link/ether 22:76:01:f0:71:9f brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 0 maxmtu 0 59 | dummy addrgenmode eui64 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 60 | {{< / highlight >}} 61 | 62 | 63 | {{% notice note %}} 64 | One thing to remember when migrating from iptables to ipvs on an existing cluster (as opposed to rebuilding it from scratch) is that all of the KUBE-SVC/KUBE-SEP chains will still be there, at least until they are cleaned up manually or the node is rebooted. 65 | {{% /notice %}} 66 | 67 | Spin up a test deployment and expose it as a `ClusterIP` Service: 68 | 69 | 70 | ```bash 71 | kubectl create deploy web --image=nginx --replicas=2 72 | kubectl expose deploy web --port 80 73 | ``` 74 | 75 | Check that all Pods are up and note the IP allocated to our Service: 76 | 77 | {{< highlight bash "linenos=false,hl_lines=3-4 7" >}} 78 | $ kubectl get pod -owide -l app=web 79 | NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES 80 | web-96d5df5c8-6bgpr 1/1 Running 0 111s 10.244.1.6 k8s-guide-worker <none> <none> 81 | web-96d5df5c8-wkfrb 1/1 Running 0 111s 10.244.2.4 k8s-guide-worker2 <none> <none> 82 | $ kubectl get svc web 83 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 84 | web ClusterIP 10.96.119.228 <none> 80/TCP 92s 85 | {{< / highlight >}} 86 | 87 | Before we move forward, there are a couple of dependencies we need to satisfy: 88 | 89 | 1. Pick one of the Nodes hosting a test deployment and install the following packages: 90 | 91 | ```bash 92 | docker exec k8s-guide-worker apt update 93 | docker exec k8s-guide-worker apt install ipset ipvsadm -y 94 | ``` 95 | 96 | 2. 
On the same Node set up the following set of aliases to simplify access to iptables, ipvs and ipset: 97 | 98 | ```bash 99 | alias ipt="docker exec k8s-guide-worker iptables -t nat -nvL" 100 | alias ipv="docker exec k8s-guide-worker ipvsadm -ln" 101 | alias ips="docker exec k8s-guide-worker ipset list" 102 | ``` 103 | 104 | ### Use case #1: Pod-to-Service communication 105 | 106 | Any packet leaving a Pod will first pass through the `PREROUTING` chain which is where kube-proxy intercepts all Service-bound traffic: 107 | 108 | {{< highlight bash "linenos=false,hl_lines=3-4 7" >}} 109 | $ ipt PREROUTING 110 | Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes) 111 | pkts bytes target prot opt in out source destination 112 | 128 12020 KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */ 113 | 0 0 DOCKER_OUTPUT all -- * * 0.0.0.0/0 192.168.224.1 114 | {{< / highlight >}} 115 | 116 | The size of the `KUBE-SERVICES` chain is reduced compared to the [`iptables`](/services/clusterip/dataplane/iptables/) mode and the lookup stops once the destination IP is matched against the `KUBE-CLUSTER-IP` ipset: 117 | 118 | {{< highlight bash "linenos=false,hl_lines=6" >}} 119 | $ ipt KUBE-SERVICES 120 | Chain KUBE-SERVICES (2 references) 121 | pkts bytes target prot opt in out source destination 122 | 0 0 KUBE-MARK-MASQ all -- * * !10.244.0.0/16 0.0.0.0/0 /* Kubernetes service cluster ip + port for masquerade purpose */ match-set KUBE-CLUSTER-IP dst,dst 123 | 0 0 KUBE-NODE-PORT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL 124 | 0 0 ACCEPT all -- * * 0.0.0.0/0 0.0.0.0/0 match-set KUBE-CLUSTER-IP dst,dst 125 | {{< / highlight >}} 126 | 127 | This ipset contains all existing ClusterIPs and the lookup is performed in [O(1)](https://en.wikipedia.org/wiki/Time_complexity#Constant_time) time: 128 | 129 | {{< highlight bash "linenos=false,hl_lines=18" >}} 130 | $ ips KUBE-CLUSTER-IP 131 | Name: KUBE-CLUSTER-IP 132 | Type: hash:ip,port 133 | Revision: 5 134 | Header: family inet hashsize 1024 maxelem 65536 135 | Size in memory: 768 136 | References: 2 137 | Number of entries: 9 138 | Members: 139 | 10.96.0.10,udp:53 140 | 10.96.0.1,tcp:443 141 | 10.96.0.10,tcp:53 142 | 10.96.148.225,tcp:80 143 | 10.96.68.46,tcp:3030 144 | 10.96.10.207,tcp:3030 145 | 10.96.0.10,tcp:9153 146 | 10.96.159.35,tcp:11211 147 | 10.96.119.228,tcp:80 148 | {{< / highlight >}} 149 | 150 | Following the lookup in the `PREROUTING` chain, our packet gets to the [routing decision](https://upload.wikimedia.org/wikipedia/commons/3/37/Netfilter-packet-flow.svg) stage which is where it gets intercepted by Netfilter's `NF_INET_LOCAL_IN` hook and redirected to IPVS. 
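Before dumping the whole table, we can ask IPVS for just the virtual service that corresponds to our ClusterIP (a quick sketch, using the `ipvsadm` package installed earlier and the `10.96.119.228:80` address allocated to the `web` Service in this lab):

```bash
# Show a single IPVS virtual service and its real servers (the backend Pod IPs)
docker exec k8s-guide-worker ipvsadm -ln -t 10.96.119.228:80
```

The complete IPVS table on this node, which also includes the NodePort and all the other cluster Services, looks like this:
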
151 | 152 | 153 | 154 | {{< highlight bash "linenos=false,hl_lines=20-22" >}} 155 | $ ipv 156 | IP Virtual Server version 1.2.1 (size=4096) 157 | Prot LocalAddress:Port Scheduler Flags 158 | -> RemoteAddress:Port Forward Weight ActiveConn InActConn 159 | TCP 192.168.224.4:31730 rr 160 | -> 10.244.1.6:80 Masq 1 0 0 161 | -> 10.244.2.4:80 Masq 1 0 0 162 | TCP 10.96.0.1:443 rr 163 | -> 192.168.224.3:6443 Masq 1 0 0 164 | TCP 10.96.0.10:53 rr 165 | -> 10.244.0.3:53 Masq 1 0 0 166 | -> 10.244.0.4:53 Masq 1 0 0 167 | TCP 10.96.0.10:9153 rr 168 | -> 10.244.0.3:9153 Masq 1 0 0 169 | -> 10.244.0.4:9153 Masq 1 0 0 170 | TCP 10.96.10.207:3030 rr 171 | -> 10.244.1.4:3030 Masq 1 0 0 172 | TCP 10.96.68.46:3030 rr 173 | -> 10.244.2.2:3030 Masq 1 0 0 174 | TCP 10.96.119.228:80 rr 175 | -> 10.244.1.6:80 Masq 1 0 0 176 | -> 10.244.2.4:80 Masq 1 0 0 177 | TCP 10.96.148.225:80 rr 178 | -> 10.244.1.6:80 Masq 1 0 0 179 | -> 10.244.2.4:80 Masq 1 0 0 180 | TCP 10.96.159.35:11211 rr 181 | -> 10.244.1.3:11211 Masq 1 0 0 182 | TCP 10.244.2.1:31730 rr 183 | -> 10.244.1.6:80 Masq 1 0 0 184 | -> 10.244.2.4:80 Masq 1 0 0 185 | TCP 127.0.0.1:31730 rr 186 | -> 10.244.1.6:80 Masq 1 0 0 187 | -> 10.244.2.4:80 Masq 1 0 0 188 | UDP 10.96.0.10:53 rr 189 | -> 10.244.0.3:53 Masq 1 0 8 190 | -> 10.244.0.4:53 Masq 1 0 8 191 | {{< / highlight >}} 192 | 193 | 194 | This is where the packet gets DNAT'ed to the IP of one of the selected backend Pods (`10.244.1.6` in our case) and continues on to its destination unmodified, following the forwarding path built by a CNI plugin. 195 | 196 | ### Use case #2: Any-to-Service communication 197 | 198 | Any host-local service trying to communicate with a ClusterIP will first get its packet through `OUTPUT` and `KUBE-SERVICES` chains: 199 | 200 | {{< highlight bash "linenos=false,hl_lines=4" >}} 201 | $ ipt OUTPUT 202 | Chain OUTPUT (policy ACCEPT 5 packets, 300 bytes) 203 | pkts bytes target prot opt in out source destination 204 | 1062 68221 KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */ 205 | 287 19636 DOCKER_OUTPUT all -- * * 0.0.0.0/0 192.168.224.1 206 | {{< / highlight >}} 207 | 208 | Since source IP does not belong to the PodCIDR range, our packet gets a de-tour via the `KUBE-MARK-MASQ` chain: 209 | 210 | {{< highlight bash "linenos=false,hl_lines=4" >}} 211 | $ ipt KUBE-SERVICES 212 | Chain KUBE-SERVICES (2 references) 213 | pkts bytes target prot opt in out source destination 214 | 0 0 KUBE-MARK-MASQ all -- * * !10.244.0.0/16 0.0.0.0/0 /* Kubernetes service cluster ip + port for masquerade purpose */ match-set KUBE-CLUSTER-IP dst,dst 215 | 0 0 KUBE-NODE-PORT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL 216 | 0 0 ACCEPT all -- * * 0.0.0.0/0 0.0.0.0/0 match-set KUBE-CLUSTER-IP dst,dst 217 | {{< / highlight >}} 218 | 219 | Here the packet gets marked for future SNAT, to make sure it will have a return path from the Pod: 220 | 221 | {{< highlight bash "linenos=false,hl_lines=4" >}} 222 | $ ipt KUBE-MARK-MASQ 223 | Chain KUBE-MARK-MASQ (13 references) 224 | pkts bytes target prot opt in out source destination 225 | 0 0 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK or 0x4000 226 | {{< / highlight >}} 227 | 228 | The following few steps are exactly the same as described for the previous use case: 229 | 230 | * The packet reaches the end of the `KUBE-SERVICES` chain. 231 | * The routing lookup returns a local dummy ipvs interface. 232 | * IPVS intercepts the packet and performs the backend selection and NATs the destination IP address. 
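At this point IPVS is also tracking the new session in its own connection table; a quick way to see the translation in action (a sketch -- entries only exist for a short while, so generate some test traffic towards the ClusterIP first) is:

```bash
# List active IPVS connections: the output shows the original ClusterIP
# destination alongside the backend Pod it was mapped to
docker exec k8s-guide-worker ipvsadm -lnc
```
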
233 | 234 | The modified packet metadata continues along the forwarding path until it hits the egress `veth` interface where it gets picked up by the `POSTROUTING` chain: 235 | 236 | {{< highlight bash "linenos=false,hl_lines=4" >}} 237 | $ ipt POSTROUTING 238 | Chain POSTROUTING (policy ACCEPT 5 packets, 300 bytes) 239 | pkts bytes target prot opt in out source destination 240 | 1199 80799 KUBE-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */ 241 | 0 0 DOCKER_POSTROUTING all -- * * 0.0.0.0/0 192.168.224.1 242 | 920 61751 KIND-MASQ-AGENT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type !LOCAL /* kind-masq-agent: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom KIND-MASQ-AGENT chain */ 243 | {{< / highlight >}} 244 | 245 | This is where the source IP of the packet gets modified to match the one of the egress interface, so the destination Pod knows where to send a reply: 246 | 247 | {{< highlight bash "linenos=false,hl_lines=4 " >}} 248 | $ ipt KUBE-POSTROUTING 249 | Chain KUBE-POSTROUTING (1 references) 250 | pkts bytes target prot opt in out source destination 251 | 0 0 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */ match-set KUBE-LOOP-BACK dst,dst,src 252 | 1 60 RETURN all -- * * 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000 253 | 0 0 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000 254 | 0 0 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service traffic requiring SNAT */ random-fully 255 | {{< / highlight >}} 256 | 257 | The final masquerading action is performed if the destination IP and Port match one of the local Endpoints which are stored in the `KUBE-LOOP-BACK` ipset: 258 | 259 | {{< highlight bash "linenos=false,hl_lines=11" >}} 260 | $ ips KUBE-LOOP-BACK 261 | Name: KUBE-LOOP-BACK 262 | Type: hash:ip,port,ip 263 | Revision: 5 264 | Header: family inet hashsize 1024 maxelem 65536 265 | Size in memory: 360 266 | References: 1 267 | Number of entries: 2 268 | Members: 269 | 10.244.1.2,tcp:3030,10.244.1.2 270 | 10.244.1.6,tcp:80,10.244.1.6 271 | {{< / highlight >}} 272 | 273 | {{% notice info %}} 274 | It should be noted that, similar to the iptables mode, all of the above lookups are only performed for the first packet of the session and all subsequent packets follow a much shorter path in the conntrack subsystem. 
275 | {{% /notice %}} 276 | 277 | 278 | ### Additional reading 279 | 280 | [Scaling Kubernetes to Support 50,000 Services](https://github.com/sbueringer/kubecon-slides/blob/4a793c54a5bb31ededb2ec3ba230aaa94bc003d7/slides/2017-kubecon-eu/Scale%20Kubernetes%20to%20Support%2050,000%20Services%20%5BI%5D%20-%20Haibin%20Xie%20&%20Quinton%20Hoole,%20Huawei%20Technologies%20-%20Scale%20Kubernetes%20to%20Support%2050000%20Services.pdf) 281 | 282 | [Comparing kube-proxy modes: iptables or IPVS?](https://www.projectcalico.org/comparing-kube-proxy-modes-iptables-or-ipvs/) 283 | 284 | [IPVS-Based In-Cluster Load Balancing Deep Dive](https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/) -------------------------------------------------------------------------------- /content/services/clusterIP/dataplane/eBPF.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "eBPF" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 40 5 | --- 6 | 7 | eBPF has emerged as a new alternative to IPTables and IPVS mechanisms implemented by `kube-proxy` with the promise to reduce CPU utilization and latency, improve throughput and increase scale. 8 | As of today, there are two implementations of Kubernetes Service's data plane in eBPF -- one from [Calico](https://docs.projectcalico.org/maintenance/ebpf/enabling-bpf) and one from [Cilium](https://docs.cilium.io/en/latest/gettingstarted/kubeproxy-free/). 9 | Since Cilium was the first product to introduce `kube-proxy`-less data plane, we'll focus on its implementation in this chapter. However it should be noted that there is no "standard" way to implement the Services data plane in eBPF, so Calico's approach may be different. 10 | 11 | Cilium's `kube-proxy` replacement is called [Host-Reachable Services](https://docs.cilium.io/en/v1.10/gettingstarted/host-services/#host-services) and it literally makes any ClusterIP reachable from the host (Kubernetes Node). It does that by attaching eBPF programs to cgroup hooks, intercepting all system calls and transparently modifying the ones that are destined to ClusterIP VIPs. Since Cilium attaches them to the root cgroup, it affects all sockets of all processes on the host. As of today, Cilium's implementation supports the following syscalls, which cover most of the use-cases but [depend](https://docs.cilium.io/en/latest/gettingstarted/kubeproxy-free/#limitations) on the underlying Linux kernel version: 12 | 13 | ``` 14 | $ bpftool cgroup tree /run/cilium/cgroupv2/ 15 | CgroupPath 16 | ID AttachType AttachFlags Name 17 | /run/cilium/cgroupv2 18 | 2005 connect4 19 | 1970 connect6 20 | 2007 post_bind4 21 | 2002 post_bind6 22 | 2008 sendmsg4 23 | 2003 sendmsg6 24 | 2009 recvmsg4 25 | 2004 recvmsg6 26 | 2006 getpeername4 27 | 1991 getpeername6 28 | ``` 29 | 30 | This is what typically happens when a client, e.g. a process inside a Pod, tries to communicate with a remote ClusterIP: 31 | 32 | * Client's network application invokes one of the syscalls. 33 | * eBPF program attached to this syscall's hook is executed. 34 | * The input to this eBPF program contains a number of socket parameters like destination IP and port number. 35 | * These input details are compared to existing ClusterIP Services and if no match is found, control flow is returned to the Linux kernel. 
36 | * In case one of the existing Services did match, the eBPF program selects one of the backend Endpoints and "redirects" the syscall to it by modifying its destination address, before passing it back to the Linux kernel. 37 | * Subsequent data is exchanged over the opened socket by calling `read()` and `write()` without any involvement from the eBPF program. 38 | 39 | It's very important to understand that in this case, the destination NAT translation happens at the syscall level, before the packet is even built by the kernel. What this means is that the first packet to leave the client network namespace already has the right destination IP and port number and can be forwarded by a separate data plane managed by a CNI plugin (in most cases though the entire data plane is managed by the same plugin). 40 | 41 | {{% notice info %}} 42 | A somewhat similar idea has previously been implemented by a product called Appswitch. See [1](https://hci.stanford.edu/cstr/reports/2017-01.pdf), [2](https://appswitch.readthedocs.io/en/latest/index.html), [3](https://networkop.co.uk/post/2018-05-29-appswitch-sdn/) for more details. 43 | {{% /notice %}} 44 | 45 | Below is a high-level diagram of what happens when a Pod on Node `worker-2` tries to communicate with a ClusterIP `10.96.32.28:80`. See [section below](/services/clusterip/dataplane/ebpf/#a-day-in-the-life-of-a-packet) for a detailed code walkthrough. 46 | 47 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=oxqjjDhMhjtZh66px_17&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}} 48 | 49 | 50 | ## Lab 51 | 52 | 53 | ### Preparation 54 | 55 | Assuming that the lab environment is already [set up](/lab/), Cilium can be enabled with the following command: 56 | 57 | ```bash 58 | make cilium 59 | ``` 60 | 61 | Wait for the Cilium daemonset to initialize: 62 | 63 | ```bash 64 | make cilium-wait 65 | ``` 66 | 67 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin: 68 | 69 | ```bash 70 | make nuke-all-pods 71 | ``` 72 | 73 | To make sure there is no interference from `kube-proxy` we'll remove it completely along with any IPTables rules set up by it: 74 | 75 | ``` 76 | make nuke-kube-proxy 77 | ``` 78 | 79 | Check that Cilium is healthy: 80 | 81 | ```bash 82 | $ make cilium-check | grep health 83 | Cilium health daemon: Ok 84 | Controller Status: 40/40 healthy 85 | Cluster health: 3/3 reachable (2021-08-02T19:52:07Z) 86 | ``` 87 | 88 | In order to have a working ClusterIP to test against, create a deployment with 3 nginx Pods and examine the assigned ClusterIP and IPs of the backend Pods: 89 | 90 | ``` 91 | $ make deployment && make scale-up && make cluster-ip 92 | $ kubectl get svc web 93 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 94 | web ClusterIP 10.96.32.28 <none> 80/TCP 5s 95 | $ kubectl get ep web 96 | NAME ENDPOINTS AGE 97 | web 10.0.0.234:80,10.0.0.27:80,10.0.2.76:80 11m 98 | ``` 99 | 100 | Now let's see what happens when a client tries to communicate with this Service. 101 | 102 | ## A day in the life of a Packet 103 | 104 | First, let's take a look at the first few packets of a client session. 
Keep a close eye on the destination IP of the captured packets: 105 | ``` 106 | $ NODE=k8s-guide-worker2 make tshoot 107 | bash-5.1# tcpdump -enni any -q & 108 | bash-5.1# tcpdump: verbose output suppressed, use -v or -vv for full protocol decode 109 | listening on any, link-type LINUX_SLL (Linux cooked v1), capture size 262144 bytes 110 | 111 | bash-5.1# curl -s 10.96.32.28 | grep Welcome 112 | Welcome to nginx! 113 |
Welcome to nginx!
114 | 20:11:29.780374 eth0 Out ifindex 24 aa:24:9c:63:2e:7d 10.0.2.202.45676 > 10.0.0.27.80: tcp 0 115 | 20:11:29.781996 eth0 In ifindex 24 2a:89:e2:43:42:6e 10.0.0.27.80 > 10.0.2.202.45676: tcp 0 116 | 20:11:29.782014 eth0 Out ifindex 24 aa:24:9c:63:2e:7d 10.0.2.202.45676 > 10.0.0.27.80: tcp 0 117 | 20:11:29.782297 eth0 Out ifindex 24 aa:24:9c:63:2e:7d 10.0.2.202.45676 > 10.0.0.27.80: tcp 75 118 | ``` 119 | 120 | The first TCP packet sent at `20:11:29.780374` already contains the destination IP of one of the backend Pods. This kind of behaviour can very easily [enhance](https://cilium.io/blog/2018/08/07/istio-10-cilium) but also [trip up](https://github.com/linkerd/linkerd2/issues/5932#issuecomment-811747872) applications relying on [traffic interception](https://docs.openservicemesh.io/docs/tasks/traffic_management/iptables_redirection/). 121 | 122 | Now let's take a close look at the "happy path" of the eBPF program responsible for this. The above `curl` command would try to connect to an IPv4 address and would invoke the [`connect()`](https://man7.org/linux/man-pages/man2/connect.2.html) syscall, to which the `connect4` eBPF program is attached ([source](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/bpf_sock.c#L446)). 123 | 124 | {{< highlight c "linenos=false,hl_lines=7 " >}} 125 | __section("connect4") 126 | int sock4_connect(struct bpf_sock_addr *ctx) 127 | { 128 | if (sock_is_health_check(ctx)) 129 | return __sock4_health_fwd(ctx); 130 | 131 | __sock4_xlate_fwd(ctx, ctx, false); 132 | return SYS_PROCEED; 133 | } 134 | {{< / highlight >}} 135 | 136 | 137 | Most of the processing is done inside the [`__sock4_xlate_fwd`](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/bpf_sock.c#L328) function; we'll break it down into multiple parts for simplicity and omit some of the less important bits that cover special use cases like `sessionAffinity` and `externalTrafficPolicy`. Note that regardless of what happens in the above function, the returned value is always `SYS_PROCEED`, which returns the control flow back to the kernel. 138 | 139 | The first thing that happens inside this function is the Services map lookup based on the destination IP and port: 140 | 141 | {{< highlight c "linenos=false,hl_lines=8-9 13 " >}} 142 | static __always_inline int __sock4_xlate_fwd(struct bpf_sock_addr *ctx, 143 | struct bpf_sock_addr *ctx_full, 144 | const bool udp_only) 145 | { 146 | struct lb4_backend *backend; 147 | struct lb4_service *svc; 148 | struct lb4_key key = { 149 | .address = ctx->user_ip4, 150 | .dport = ctx_dst_port(ctx), 151 | }, orig_key = key; 152 | struct lb4_service *backend_slot; 153 | 154 | svc = lb4_lookup_service(&key, true); 155 | if (!svc) 156 | svc = sock4_wildcard_lookup_full(&key, in_hostns); 157 | if (!svc) 158 | return -ENXIO; 159 | {{< / highlight >}} 160 | 161 | Kubernetes Services can have an arbitrary number of Endpoints, depending on the number of matching Pods, however eBPF maps have [fixed size](https://docs.cilium.io/en/latest/concepts/ebpf/maps/#ebpf-maps), so storing variable-size values is not possible. In order to overcome that, the lookup process is broken into two steps: 162 | 163 | * The first lookup is done just with the destination IP and port and the returned value tells how many Endpoints are currently associated with the Service. 
164 | * The second lookup is done with the same destination IP and port _plus_ an additional field called `backend_slot` which corresponds to one of the backend Endpoints. 165 | 166 | During the first lookup `backend_slot` is set to 0. The returned value contains [a number of fields](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/common.h#L767) but the most important one at this stage is `count` -- the total number of Endpoints for this Service. 167 | 168 | {{< highlight c "linenos=false,hl_lines=8-9 15 " >}} 169 | static __always_inline 170 | struct lb4_service *lb4_lookup_service(struct lb4_key *key, 171 | const bool scope_switch) 172 | { 173 | struct lb4_service *svc; 174 | 175 | key->scope = LB_LOOKUP_SCOPE_EXT; 176 | key->backend_slot = 0; 177 | svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key); 178 | if (svc) { 179 | if (!scope_switch || !lb4_svc_is_local_scope(svc)) 180 | return svc->count ? svc : NULL; 181 | key->scope = LB_LOOKUP_SCOPE_INT; 182 | svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key); 183 | if (svc && svc->count) 184 | return svc; 185 | } 186 | 187 | return NULL; 188 | } 189 | {{< / highlight >}} 190 | 191 | Let's look inside the eBPF map and see what entries match that last two octets of our ClusterIP `10.96.32.28`: 192 | 193 | {{< highlight bash "linenos=false,hl_lines=5 " >}} 194 | $ NODE=k8s-guide-worker2 195 | $ cilium=$(kubectl get -l k8s-app=cilium pods -n cilium --field-selector spec.nodeName=$NODE -o jsonpath='{.items[0].metadata.name}') 196 | $ kubectl -n cilium exec -it $cilium -- bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_lb4_services_v2 | grep "20 1c" 197 | key: 0a 60 20 1c 00 50 03 00 00 00 00 00 value: 0b 00 00 00 00 00 00 07 00 00 00 00 198 | key: 0a 60 20 1c 00 50 00 00 00 00 00 00 value: 00 00 00 00 03 00 00 07 00 00 00 00 199 | key: 0a 60 20 1c 00 50 01 00 00 00 00 00 value: 09 00 00 00 00 00 00 07 00 00 00 00 200 | key: 0a 60 20 1c 00 50 02 00 00 00 00 00 value: 0a 00 00 00 00 00 00 07 00 00 00 00 201 | {{< / highlight >}} 202 | 203 | If the `backend_slot` is set to 0, the key would only contain the IP and port of the Service, so that second line would match the first lookup and the [returned value](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/common.h#L767) can be interpreted as: 204 | 205 | * `backend_id = 0` 206 | * `count = 3` 207 | 208 | Now the eBPF program knows that the total number of Endpoints is 3 but it still hasn't picked one yet. 
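As a side note, the hex keys in these map dumps can be decoded by hand -- for the service key above, `0a 60 20 1c` is the ClusterIP and `00 50` is the port, both in network byte order. A tiny sketch using nothing but the shell's `printf`:

```bash
# 0a 60 20 1c -> 10.96.32.28 (the ClusterIP), 00 50 -> 80 (the port)
printf '%d.%d.%d.%d\n' 0x0a 0x60 0x20 0x1c
printf '%d\n' 0x0050
```
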
The control returns to the `__sock4_xlate_fwd` function where the `count` information is used to update the lookup `key.backend_slot`: 209 | 210 | {{< highlight c "linenos=false,hl_lines=4 " >}} 211 | if (backend_id == 0) { 212 | backend_from_affinity = false; 213 | 214 | key.backend_slot = (sock_select_slot(ctx_full) % svc->count) + 1; 215 | backend_slot = __lb4_lookup_backend_slot(&key); 216 | if (!backend_slot) { 217 | update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND_SLOT); 218 | return -ENOENT; 219 | } 220 | 221 | backend_id = backend_slot->backend_id; 222 | backend = __lb4_lookup_backend(backend_id); 223 | } 224 | {{< / highlight >}} 225 | 226 | This is where the backend selection takes place either randomly (for TCP) or based on the [socket cookie](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/bpf_sock.c#L101) (for UDP): 227 | 228 | {{< highlight c "linenos=false,hl_lines=5 " >}} 229 | static __always_inline __maybe_unused 230 | __u64 sock_select_slot(struct bpf_sock_addr *ctx) 231 | { 232 | return ctx->protocol == IPPROTO_TCP ? 233 | get_prandom_u32() : sock_local_cookie(ctx); 234 | } 235 | {{< / highlight >}} 236 | 237 | The [second lookup](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/lb.h#L1095) is performed in the same map, but now the key contains the previously selected `backend_slot`: 238 | 239 | {{< highlight c "linenos=false,hl_lines=4 " >}} 240 | static __always_inline 241 | struct lb4_service *__lb4_lookup_backend_slot(struct lb4_key *key) 242 | { 243 | return map_lookup_elem(&LB4_SERVICES_MAP_V2, key); 244 | } 245 | {{< / highlight >}} 246 | 247 | The lookup result will contain either one of the values from rows 1, 3 or 4 and will have a non-zero value for `backend_id` -- `0b 00`, `09 00` or `0a 00`: 248 | 249 | {{< highlight c "linenos=false,hl_lines=2 4 5 " >}} 250 | $ kubectl -n cilium exec -it $cilium -- bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_lb4_services_v2 | grep "20 1c" 251 | key: 0a 60 20 1c 00 50 03 00 00 00 00 00 value: 0b 00 00 00 00 00 00 07 00 00 00 00 252 | key: 0a 60 20 1c 00 50 00 00 00 00 00 00 value: 00 00 00 00 03 00 00 07 00 00 00 00 253 | key: 0a 60 20 1c 00 50 01 00 00 00 00 00 value: 09 00 00 00 00 00 00 07 00 00 00 00 254 | key: 0a 60 20 1c 00 50 02 00 00 00 00 00 value: 0a 00 00 00 00 00 00 07 00 00 00 00 255 | {{< / highlight >}} 256 | 257 | Using this value we can now extract IP and port details of the backend Pod: 258 | 259 | 260 | ```c 261 | static __always_inline struct lb4_backend *__lb4_lookup_backend(__u16 backend_id) 262 | { 263 | return map_lookup_elem(&LB4_BACKEND_MAP, &backend_id); 264 | } 265 | ``` 266 | 267 | Let's assume that the `backend_id` that got chosen before was `0a 00` and look up the details in the eBPF map: 268 | 269 | ``` 270 | $ kubectl -n cilium exec -it $cilium -- bpftool map lookup pinned /sys/fs/bpf/tc/globals/cilium_lb4_backends key 0x0a 0x00 271 | key: 0a 00 value: 0a 00 00 1b 00 50 00 00 272 | ``` 273 | 274 | The [returned value](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/common.h#L782) can be interpreted as: 275 | 276 | * **Address** = `10.0.0.27` 277 | * **Port** = `80` 278 | 279 | Finally, the eBPF program does the socket-based NAT translation, i.e. 
re-writing the destination IP and port with the values returned from the earlier lookup:
280 | 
281 | {{< highlight c "linenos=false,hl_lines=1 2 " >}}
282 | 
283 | ctx->user_ip4 = backend->address;
284 | ctx_set_port(ctx, backend->port);
285 | 
286 | return 0;
287 | {{< / highlight >}}
288 | 
289 | 
290 | At this stage, the eBPF program returns and execution flow continues inside the Linux kernel networking stack all the way until the packet is built and sent out of the egress interface. The packet continues along the path built by the [CNI portion](/cni/cilium) of Cilium.
291 | 
292 | This is all that's required to replace the biggest part of `kube-proxy`'s functionality. One big difference from the `kube-proxy` implementation is that NAT translation only happens for traffic originating from one of the Kubernetes nodes, e.g. [externally originated](https://docs.projectcalico.org/networking/advertise-service-ips) ClusterIP traffic is not currently supported. This is why we haven't considered the **Any-to-Service** communication use case, as we did for IPTables and IPVS.
293 | 
294 | 
295 | {{% notice info %}}
296 | Due to a [known issue](https://docs.cilium.io/en/v1.9/gettingstarted/kind/#unable-to-contact-k8s-api-server) with kind, make sure to run `make cilium-unhook` when you're finished with this Cilium lab to detach eBPF programs from the host cgroup.
297 | {{% /notice %}}
298 | 
299 | 
300 | ### Additional reading
301 | 
302 | [Cilium socket LB presentation](https://docs.google.com/presentation/d/1w2zlpGWV7JUhHYd37El_AUZzyUNSvDfktrF5MJ5G8Bs/edit#slide=id.g746fc02b5b_2_0)
303 | 
304 | [Kubernetes Without kube-proxy](https://docs.cilium.io/en/latest/gettingstarted/kubeproxy-free/)
305 | 
306 | 
--------------------------------------------------------------------------------
/content/services/clusterIP/dataplane/iptables.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "IPTABLES"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 10
5 | ---
6 | 
7 | Most of the focus of this section will be on the standard node-local proxy implementation called [`kube-proxy`](https://kubernetes.io/docs/concepts/overview/components/#kube-proxy). It is used by default by most of the Kubernetes orchestrators and is installed as a daemonset on top of a newly bootstrapped cluster:
8 | 
9 | 
10 | ```
11 | $ kubectl get daemonset -n kube-system -l k8s-app=kube-proxy
12 | NAME         DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR            AGE
13 | kube-proxy   3         3         3       3            3           kubernetes.io/os=linux   2d16h
14 | ```
15 | 
16 | The default mode of operation for `kube-proxy` is `iptables`, as it provides support for a wider set of operating systems without requiring extra kernel modules and has "good enough" performance characteristics for the majority of small to medium-sized clusters.
17 | 
18 | This area of Kubernetes networking is one of the most poorly documented. On the one hand, there are [blogposts](https://medium.com/google-cloud/understanding-kubernetes-networking-services-f0cb48e4cc82) that cover parts of the `kube-proxy` dataplane; on the other hand, there's an amazing [diagram](https://docs.google.com/drawings/d/1MtWL8qRTs6PlnJrW4dh8135_S9e2SaawT410bJuoBPk/edit) created by [Tim Hockin](https://twitter.com/thockin) that shows a complete logical flow of packet forwarding decisions but provides very little context and is quite difficult to trace for specific flows.
The goal of this article is to bridge the gap between these two extremes and provide a high level of detail while maintaining an easily consumable format.
19 | 
20 | So for demonstration purposes, we'll use the following topology with a "web" deployment and two pods scheduled on different worker nodes. The packet forwarding logic for ClusterIP-type services has two distinct paths within the dataplane, which is what we'll be focusing on next:
21 | 
22 | 1. **Pod-to-Service** communication (purple packets) -- implemented entirely within the egress node, relying on the CNI plugin for pod-to-pod reachability.
23 | 2. **Any-to-Service** communication (grey packets) -- includes any externally-originated and, most notably, node-to-service traffic flows.
24 | 
25 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=nEL34B1qbs_s_G34E68V&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
26 | 
27 | 
28 | The above diagram shows a slightly simplified sequence of match/set actions implemented inside Netfilter's NAT table. The lab section below will show a more detailed view of this dataplane along with the verification commands.
29 | 
30 | {{% notice note %}}
31 | One key thing to remember is that none of the ClusterIPs implemented this way are visible in the Linux routing table. The whole dataplane is implemented entirely within the iptables NAT table, which makes it both very flexible and extremely difficult to troubleshoot at the same time.
32 | {{% /notice %}}
33 | 
34 | ### Lab Setup
35 | 
36 | To make sure the lab is in the right state, reset it to a blank state:
37 | 
38 | ```bash
39 | make up && make reset
40 | ```
41 | 
42 | Now let's spin up a new deployment and expose it with a ClusterIP service:
43 | 
44 | ```bash
45 | $ kubectl create deploy web --image=nginx --replicas=2
46 | $ kubectl expose deploy web --port 80
47 | ```
48 | 
49 | The result of the above two commands can be verified like this:
50 | 
51 | ```bash
52 | $ kubectl get deploy web
53 | NAME   READY   UP-TO-DATE   AVAILABLE   AGE
54 | web    2/2     2            2           160m
55 | $ kubectl get svc web
56 | NAME   TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)   AGE
57 | web    ClusterIP   10.96.94.225   <none>        80/TCP    31s
58 | ```
59 | 
60 | The simplest way to test connectivity would be to connect to the assigned ClusterIP `10.96.94.225` from one of the nodes, e.g.:
61 | 
62 | ```bash
63 | $ docker exec k8s-guide-worker curl -s 10.96.94.225 | grep Welcome
64 | <title>Welcome to nginx!</title>
65 | <h1>Welcome to nginx!</h1>
66 | ```
67 | 
68 | One last thing before moving on, let's set up the following bash alias as a shortcut for viewing `k8s-guide-worker`'s iptables NAT table:
69 | 
70 | ```bash
71 | $ alias d="docker exec k8s-guide-worker iptables -t nat -nvL"
72 | ```
73 | 
74 | ### Use case #1: Pod-to-Service communication
75 | 
76 | According to Tim's [diagram](https://docs.google.com/drawings/d/1MtWL8qRTs6PlnJrW4dh8135_S9e2SaawT410bJuoBPk/edit), all Pod-to-Service packets get intercepted by the `PREROUTING` chain:
77 | 
78 | {{< highlight bash "linenos=false,hl_lines=4" >}}
79 | $ d PREROUTING
80 | Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
81 | pkts bytes target prot opt in out source destination
82 | 313 18736 KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
83 | 36 2242 DOCKER_OUTPUT all -- * * 0.0.0.0/0 172.16.0.190
84 | {{< / highlight >}}
85 | 
86 | These packets get redirected to the `KUBE-SERVICES` chain, where they get matched against _all_ configured ClusterIPs, eventually reaching these lines:
87 | 
88 | {{< highlight bash "linenos=false,hl_lines=3" >}}
89 | $ d KUBE-SERVICES | grep 10.96.94.225
90 | 3 180 KUBE-MARK-MASQ tcp -- * * !10.244.0.0/16 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
91 | 3 180 KUBE-SVC-LOLE4ISW44XBNF3G tcp -- * * 0.0.0.0/0 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
92 | {{< / highlight >}}
93 | 
94 | Since the source IP of the packet belongs to a Pod (`10.244.0.0/16` is the PodCIDR range), the first rule does not match and the lookup continues in the service-specific chain referenced by the second rule. Here we have two Pods backing the Service (`--replicas=2`), so the traffic is spread equally between them: the first rule is evaluated with a 50% probability and anything that falls through is caught by the second rule. For Services with more Endpoints, kube-proxy adjusts the per-rule probabilities so that each Endpoint still receives an equal share of the traffic:
95 | 
96 | {{< highlight bash "linenos=false,hl_lines=4 5" >}}
97 | $ d KUBE-SVC-LOLE4ISW44XBNF3G
98 | Chain KUBE-SVC-LOLE4ISW44XBNF3G (1 references)
99 | pkts bytes target prot opt in out source destination
100 | 0 0 KUBE-SEP-MHDQ23KUGG7EGFMW all -- * * 0.0.0.0/0 0.0.0.0/0 /* default/web */ statistic mode random probability 0.50000000000
101 | 0 0 KUBE-SEP-ZA2JI7K7LSQNKDOS all -- * * 0.0.0.0/0 0.0.0.0/0 /* default/web */
102 | {{< / highlight >}}
103 | 
104 | Let's assume that in this case the first rule gets matched, so our packet continues on to the next chain where it gets DNAT'ed to the target IP of the destination Pod (`10.244.1.3`):
105 | 
106 | {{< highlight bash "linenos=false,hl_lines=5" >}}
107 | $ d KUBE-SEP-MHDQ23KUGG7EGFMW
108 | Chain KUBE-SEP-MHDQ23KUGG7EGFMW (1 references)
109 | pkts bytes target prot opt in out source destination
110 | 0 0 KUBE-MARK-MASQ all -- * * 10.244.1.3 0.0.0.0/0 /* default/web */
111 | 3 180 DNAT tcp -- * * 0.0.0.0/0 0.0.0.0/0 /* default/web */ tcp to:10.244.1.3:80
112 | {{< / highlight >}}
113 | 
114 | From here on our packet remains unmodified and continues along the forwarding path set up by the [CNI plugin](/cni/kindnet/) until it reaches the target Node and gets sent directly to the destination Pod.
115 | 
116 | 
117 | 
118 | ### Use case #2: Any-to-Service communication
119 | 
120 | Let's assume that the `k8s-guide-worker` node (IP `172.18.0.12`) is sending a packet to our ClusterIP service.
This packet gets intercepted in the `OUTPUT` chain and continues to the `KUBE-SERVICES` chain, where it first gets redirected via the `KUBE-MARK-MASQ` chain:
121 | 
122 | {{< highlight bash "linenos=false,hl_lines=4 8" >}}
123 | $ d OUTPUT
124 | Chain OUTPUT (policy ACCEPT 224 packets, 13440 bytes)
125 | pkts bytes target prot opt in out source destination
126 | 4540 272K KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
127 | 42 2661 DOCKER_OUTPUT all -- * * 0.0.0.0/0 172.16.0.190
128 | 
129 | $ d KUBE-SERVICES | grep 10.96.94.225
130 | 3 180 KUBE-MARK-MASQ tcp -- * * !10.244.0.0/16 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
131 | 3 180 KUBE-SVC-LOLE4ISW44XBNF3G tcp -- * * 0.0.0.0/0 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
132 | {{< / highlight >}}
133 | 
134 | The purpose of this chain is to mark all packets that will need to get SNAT'ed before they are sent to their final destination:
135 | 
136 | {{< highlight bash "linenos=false,hl_lines=4" >}}
137 | $ d KUBE-MARK-MASQ
138 | Chain KUBE-MARK-MASQ (19 references)
139 | pkts bytes target prot opt in out source destination
140 | 3 180 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK or 0x4000
141 | {{< / highlight >}}
142 | 
143 | Since `MARK` is not a [terminating target](https://gist.github.com/mcastelino/c38e71eb0809d1427a6650d843c42ac2#targets), the lookup continues down the `KUBE-SERVICES` chain, where our packet gets DNAT'ed to one of the randomly selected backend Endpoints (as shown above).
144 | 
145 | However, this time, before it gets sent to its final destination, the packet takes another detour via the `KUBE-POSTROUTING` chain:
146 | 
147 | 
148 | {{< highlight bash "linenos=false,hl_lines=4" >}}
149 | $ d POSTROUTING
150 | Chain POSTROUTING (policy ACCEPT 140 packets, 9413 bytes)
151 | pkts bytes target prot opt in out source destination
152 | 715 47663 KUBE-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
153 | 0 0 DOCKER_POSTROUTING all -- * * 0.0.0.0/0 172.16.0.190
154 | 657 44150 KIND-MASQ-AGENT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type !LOCAL /* kind-masq-agent: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom KIND-MASQ-AGENT chain */
155 | {{< / highlight >}}
156 | 
157 | Here all packets carrying the special SNAT mark (0x4000) fall through to the last rule and get SNAT'ed to the IP of the outgoing interface, which in this case is the veth interface connected to the Pod:
158 | 
159 | {{< highlight bash "linenos=false,hl_lines=6" >}}
160 | $ d KUBE-POSTROUTING
161 | Chain KUBE-POSTROUTING (1 references)
162 | pkts bytes target prot opt in out source destination
163 | 463 31166 RETURN all -- * * 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
164 | 2 120 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
165 | 2 120 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service traffic requiring SNAT */ random-fully
166 | {{< / highlight >}}
167 | 
168 | 
169 | The final `MASQUERADE` action ensures that the return packets follow the same path back, even if they originated outside of the Kubernetes cluster.
170 | 
171 | {{% notice info %}}
172 | The above sequence of lookups may look long and inefficient, but bear in mind that it is only performed for the first packet of a flow -- the remainder of the session gets offloaded to Netfilter's connection tracking system.
173 | {{% /notice %}} 174 | 175 | 176 | 177 | ### Additional Reading 178 | 179 | * [**Netfilter Packet flow** ](https://upload.wikimedia.org/wikipedia/commons/3/37/Netfilter-packet-flow.svg) 180 | * [**Logical diagram of kube-proxy in iptables mode**](https://docs.google.com/drawings/d/1MtWL8qRTs6PlnJrW4dh8135_S9e2SaawT410bJuoBPk/edit) 181 | * [**Alternative kube-proxy implementations**](https://arthurchiao.art/blog/cracking-k8s-node-proxy/) 182 | * [**Kubernetes networking demystified**](https://www.cncf.io/blog/2020/01/30/kubernetes-networking-demystified-a-brief-guide/) -------------------------------------------------------------------------------- /content/services/mesh/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Service Meshes" 3 | date: 2020-09-13T17:33:04+01:00 4 | weight: 80 5 | --- 6 | 7 | # Under construction [help needed] -------------------------------------------------------------------------------- /layouts/partials/favicon.html: -------------------------------------------------------------------------------- 1 | 2 | {{ template "_internal/twitter_cards.html" . }} -------------------------------------------------------------------------------- /layouts/partials/logo.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /layouts/partials/menu-footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Star 5 | 6 | 7 | Fork 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /layouts/shortcodes/div.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/networkop/tkng/a3667e05c00fff40a82adcfd6178ca6fd9db3618/layouts/shortcodes/div.html -------------------------------------------------------------------------------- /layouts/shortcodes/iframe.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | 2 | [build] 3 | publish = "public" 4 | command = "hugo --gc --minify" 5 | 6 | [context.production.environment] 7 | HUGO_VERSION = "0.74.3" 8 | HUGO_ENV = "production" 9 | HUGO_ENABLEGITINFO = "true" 10 | 11 | [context.split1] 12 | command = "hugo --gc --minify --enableGitInfo" 13 | 14 | [context.split1.environment] 15 | HUGO_VERSION = "0.74.3" 16 | HUGO_ENV = "production" 17 | 18 | [context.deploy-preview] 19 | command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL" 20 | 21 | [context.deploy-preview.environment] 22 | HUGO_VERSION = "0.74.3" 23 | 24 | [context.branch-deploy] 25 | command = "hugo --gc --minify -b $DEPLOY_PRIME_URL" 26 | 27 | [context.branch-deploy.environment] 28 | HUGO_VERSION = "0.74.3" 29 | 30 | [context.next.environment] 31 | HUGO_ENABLEGITINFO = "true" 32 | 33 | [[redirects]] 34 | from = "https://k8s.networkop.co.uk/*" 35 | to = "https://www.tkng.io/:splat" 36 | status = 301 37 | force = true -------------------------------------------------------------------------------- /static/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/networkop/tkng/a3667e05c00fff40a82adcfd6178ca6fd9db3618/static/images/favicon.png -------------------------------------------------------------------------------- /static/images/k8s-guide-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/networkop/tkng/a3667e05c00fff40a82adcfd6178ca6fd9db3618/static/images/k8s-guide-logo.png --------------------------------------------------------------------------------