├── .github
│   └── FUNDING.yml
├── .gitmodules
├── Makefile
├── _redirects
├── archetypes
│   └── default.md
├── config.toml
├── content
│   ├── IPv6
│   │   └── _index.md
│   ├── _index.md
│   ├── arch
│   │   └── _index.md
│   ├── cni
│   │   ├── _index.md
│   │   ├── calico.md
│   │   ├── cilium.md
│   │   ├── flannel.md
│   │   ├── iaas
│   │   │   └── _index.md
│   │   ├── kindnet.md
│   │   └── weave.md
│   ├── credits.md
│   ├── dns
│   │   └── _index.md
│   ├── ingress
│   │   ├── _index.md
│   │   ├── egress
│   │   │   └── _index.md
│   │   ├── gateway
│   │   │   └── _index.md
│   │   └── ingress
│   │       └── _index.md
│   ├── lab
│   │   └── _index.md
│   ├── security
│   │   └── _index.md
│   └── services
│       ├── Headless
│       │   └── _index.md
│       ├── Optimisations
│       │   └── _index.md
│       ├── _index.md
│       ├── clusterIP
│       │   ├── _index.md
│       │   ├── control-plane.md
│       │   └── dataplane
│       │       ├── IPVS.md
│       │       ├── eBPF.md
│       │       └── iptables.md
│       ├── loadBalancer
│       │   └── _index.md
│       ├── mesh
│       │   └── _index.md
│       └── nodeport
│           └── _index.md
├── layouts
│   ├── partials
│   │   ├── favicon.html
│   │   ├── logo.html
│   │   └── menu-footer.html
│   └── shortcodes
│       ├── div.html
│       └── iframe.html
├── license.md
├── netlify.toml
└── static
    └── images
        ├── favicon.png
        └── k8s-guide-logo.png
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [networkop] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
14 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "themes/hugo-theme-learn"]
2 | path = themes/hugo-theme-learn
3 | url = https://github.com/matcornic/hugo-theme-learn.git
4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Pinned binary name because I have two Hugo versions installed
2 | HUGO := hugo-0.74.3
3 |
4 | RANDOM_STR =
5 |
6 | DEFAULT: local
7 |
8 | ## Start a local server
9 | local:
10 | ${HUGO} server -D --bind 0.0.0.0
11 |
12 | ## Push the latest commit upstream
13 | release:
14 | git add .
15 | git commit -m "$$(date)"
16 | git push
17 |
18 | ## Create a new chapter
19 | chapter:
20 | hugo new --kind chapter newchapter/_index.md
21 |
22 | # From: https://gist.github.com/klmr/575726c7e05d8780505a
23 | help:
24 | @echo "$$(tput sgr0)";sed -ne"/^## /{h;s/.*//;:d" -e"H;n;s/^## //;td" -e"s/:.*//;G;s/\\n## /---/;s/\\n/ /g;p;}" ${MAKEFILE_LIST}|awk -F --- -v n=$$(tput cols) -v i=15 -v a="$$(tput setaf 6)" -v z="$$(tput sgr0)" '{printf"%s%*s%s ",a,-i,$$1,z;m=split($$2,w," ");l=n-i;for(j=1;j<=m;j++){l-=length(w[j])+1;if(l<= 0){l=n-i-length(w[j])-1;printf"\n%*s ",-i," ";}printf"%s ",w[j];}printf"\n";}'
25 |
26 |
27 | # https://desk.draw.io/support/solutions/articles/16000042542-embed-html
28 |
29 |
30 |
--------------------------------------------------------------------------------
/_redirects:
--------------------------------------------------------------------------------
1 | https://k8s.networkop.co.uk/arch/* https://www.tkng.io/:splat 301!
2 |
--------------------------------------------------------------------------------
/archetypes/default.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "{{ replace .Name "-" " " | title }}"
3 | date: {{ .Date }}
4 | draft: true
5 | ---
6 |
7 |
--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
1 | baseURL = "http://www.tkng.io/"
2 | languageCode = "en-us"
3 | title = "The Kubernetes Networking Guide"
4 | theme = "hugo-theme-learn"
5 |
6 | [outputs]
7 | home = [ "HTML", "RSS", "JSON"]
8 |
9 |
10 | [params]
11 | description = "The Kubernetes Networking Guide"
12 | showVisitedLinks = false
13 | images = ["images/k8s-guide-logo.png"]
14 | disableLanguageSwitchingButton = true
15 | disableMermaid = true
16 | editURL = "https://github.com/networkop/k8s-networking-guide/tree/master/content/"
17 | disableInlineCopyToClipBoard = true
18 |
19 |
20 | [[menu.shortcuts]]
21 | name = " Github repo"
22 | identifier = "ds"
23 | url = "https://github.com/networkop/k8s-networking-guide"
24 | weight = 10
25 |
26 | [[menu.shortcuts]]
27 | name = " Lab repo"
28 | identifier = "labs"
29 | url = "https://github.com/networkop/k8s-guide-labs"
30 | weight = 20
31 |
32 | [[menu.shortcuts]]
33 | name = " Credits"
34 | url = "/credits"
35 | weight = 30
36 |
--------------------------------------------------------------------------------
/content/IPv6/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "IPv6"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 90
5 | summary: "The state and readiness of IPv6 networking"
6 | ---
7 |
8 |
9 | # Under construction [help needed]
--------------------------------------------------------------------------------
/content/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The Kubernetes Networking Guide"
3 | ---
4 |
5 | # The Kubernetes Networking Guide
6 |
7 | The purpose of this website is to provide an overview of various **Kubernetes networking components** with a specific focus on **exactly how** they implement the required functionality.
8 |
9 | The information here can be used for educational purposes, however, the main goal is to provide a single point of reference for designing, operating and troubleshooting cluster networking solutions.
10 |
11 | {{% notice warning %}}
12 | This is not a generic Kubernetes learning resource. The assumption is that the reader is already familiar with basic concepts and building blocks of a Kubernetes cluster -- pods, deployments, services.
13 | {{% /notice %}}
14 |
15 |
16 |
17 | ## Structure
18 |
19 | The guide is split into multiple parts that can be studied mostly independently; however, they all work together to provide a complete set of end-to-end cluster network abstractions.
20 |
21 | {{% children description="true" %}}
22 | {{% /children %}}
23 |
24 | {{% notice info %}}
25 | **Why this structure?** -- To explain Kubernetes from a network-centric view in a language understandable to people with a traditional network engineering background. This structure is also based on how [#sig-network](https://github.com/kubernetes/community/tree/master/sig-network) is organised into interest groups.
26 | {{% /notice %}}
27 |
28 |
29 | ## Hands-on Labs {#labs}
30 |
31 | Where possible, every topic in this guide will include a dedicated hands-on lab which can be spun up locally in a matter of minutes. Refer to the [Lab](lab/) page for setup instructions.
32 |
33 |
34 |
35 | ## Contributing
36 | If you find an error or want to add something to this guide, just click the **Edit this page** link displayed at the top right of each page (except this one) and submit a pull request.
37 |
38 | {{% notice note %}}
39 | When submitting brand new content, please consider adding a corresponding lab to the [Labs repo](https://github.com/networkop/k8s-guide-labs)
40 | {{% /notice %}}
41 |
42 |
43 |
--------------------------------------------------------------------------------
/content/arch/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: The Kubernetes Network Model
3 | menuTitle: Network Model
4 | weight: 5
5 | summary: "High-level overview of cluster networking components"
6 | ---
7 |
8 | The [official documentation](https://kubernetes.io/docs/concepts/cluster-administration/networking/#the-kubernetes-network-model) does a very good job of describing the cluster network assumptions and requirements. I'll repeat the main ideas here for completeness and to lay the foundation for the rest of the article. Kubernetes networking can be seen as several (more or less) orthogonal problems:
9 |
10 | * **Local** communications between containers in the same Pod -- solved by the local loopback interface.
11 | * **Pod-to-Pod** East-West communication -- solved by a CNI plugin and discussed in the [CNI](/cni/) chapter of this guide.
12 | * Multi-pod **service** abstraction -- a way to group similar Pods and load-balance traffic to them, discussed in the [Services](/services/)
13 | chapter of this guide.
14 | * **Ingress** & Egress communication -- getting the traffic in and out of the Kubernetes cluster, discussed in the [Ingress & Egress](/ingress/) chapter of this guide.
15 |
16 | In addition to the above, there are a number of auxiliary problems that are covered in their separate chapters:
17 |
18 | * **Network Policies** -- a way to filter traffic going to and from Pods.
19 | * **DNS** -- the foundation of cluster service discovery.
20 | * **IPv6** -- unfortunately still requires a separate chapter to discuss the multitude of caveats and limitations.
21 |
22 | Despite their orthogonality, each layer builds on top of abstractions provided by another, for example:
23 |
24 | * **Ingress** -- associates a URL with a backend Service, learns the associated Endpoints and sends the traffic to one of the PodIPs, relying on the Pod-to-Pod connectivity.
25 | * **Service** -- performs the client-side load-balancing on the originating Node and sends the traffic to the destination PodIP, effectively relying on the Node-to-Pod connectivity.
26 |
27 | Here's an example of how different Kubernetes Resources are stacked together to provide a **North-South** connectivity:
28 |
29 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=ydZ7vDq7JmuY7Tl_GMgH&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
30 |
31 | While the above is the canonical way of exposing an application in Kubernetes, it is by no means the only one. As in any typical cloud infrastructure, the functions of different layers overlap and thus create space for additional deployment scenarios:
32 |
33 | * **Ingress** can **proxy TCP and UDP** traffic to the backend ports. This comes in handy when the application protocol is not HTTP or if you want to string multiple Ingress proxies together. While Ingress controllers support this through custom [ConfigMaps](https://kubernetes.github.io/ingress-nginx/user-guide/exposing-tcp-udp-services/) or [annotations](https://docs.citrix.com/en-us/citrix-k8s-ingress-controller/how-to/tcp-udp-ingress.html), the Gateway API project (which can be viewed as an evolution of Ingress) supports these features [natively](https://gateway-api.sigs.k8s.io/guides/tcp/).
34 | * A **Service** of type **LoadBalancer or NodePort** can be used to expose backend ports without an Ingress. This can be useful when the Pods need to expose an esoteric protocol (e.g. NETCONF) or when application proxy functions are simply not needed, e.g. a small-scale internal cluster with no need for TLS termination or traffic rate-limiting, as sketched below.
35 |
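For illustration, here is a minimal sketch of the second option using nothing but `kubectl` (the Deployment name and image are made up):

```bash
# Create a throwaway Deployment and expose it outside the cluster via a NodePort
kubectl create deployment web --image=nginx
kubectl expose deployment web --port=80 --type=NodePort

# The allocated nodePort (30000-32767 by default) is reachable on any Node's IP
kubectl get service web
```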
36 |
37 | {{% notice note %}}
38 | The main point is that Kubernetes Networking is not just a CNI or a kube-proxy or an Ingress controller. It's all of the above working in unison to provide a consistent network abstraction for hosted applications and external users.
39 | {{% /notice %}}
--------------------------------------------------------------------------------
/content/cni/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: CNI
3 | menuTitle: "CNI"
4 | weight: 10
5 | summary: Pod Networking within and between Nodes
6 | ---
7 |
8 | ## Main Goals
9 |
10 | The official documentation [outlines](https://kubernetes.io/docs/concepts/cluster-administration/networking/#the-kubernetes-network-model) a number of requirements that any CNI plugin implementation should support. Rephrasing it in a slightly different way, a CNI plugin must provide at least the following two things:
11 |
12 | * **Connectivity** - making sure that a Pod gets its default `eth0` interface with an IP reachable from the root network namespace of the hosting Node.
13 | * **Reachability** - making sure that Pods running on different Nodes can reach each other directly (without NAT).
14 |
15 | The connectivity requirement is the most straightforward one to understand -- every Pod must have a NIC to communicate with anything outside of its own network namespace. Some local processes on the Node (e.g. kubelet) need to reach the PodIP from the root network namespace (e.g. to perform health and readiness checks), hence the root NS connectivity requirement.
16 |
17 | There are a number of [reference](https://github.com/containernetworking/plugins#main-interface-creating) CNI plugins that can be used to set up connectivity; the most notable examples are:
18 |
19 | * **ptp** -- creates a veth link in the root namespace and plugs the other end into the Pod's namespace.
20 | * **bridge** -- does the same but also connects the rootNS end of the link to the bridge.
21 | * **macvlan/ipvlan** -- use the corresponding drivers to connect containers directly to the NIC of the Node.
22 |
23 | {{% notice info %}}
24 | These reference plugins are very often combined and re-used by other, more complicated CNI plugins (see [kindnet](/cni/kindnet/) or [flannel](/cni/flannel)).
25 | {{% /notice %}}
26 |
27 | Reachability, on the other hand, may require a bit of unpacking:
28 |
29 | * Every Pod gets a unique IP from a `PodCIDR` range configured on the Node.
30 | * This range is assigned to the Node during kubelet bootstrapping phase.
31 | * Nodes are not aware of `PodCIDRs` assigned to other Nodes, allocations are normally managed by the controller-manager based on the `--cluster-cidr` configuration flag.
32 | * Depending on the type of underlying connectivity, establishing end-to-end reachability between `PodCIDRs` may require different methods:
33 | - If all Nodes are in the **same Layer2 domain**, the connectivity can be established by configuring a **full mesh of static routes** on all Nodes with NextHop set to the internal IP of the peer Nodes.
34 | - If some Nodes are in **different Layer2 domains**, the connectivity can be established with either:
35 | * **Orchestrating the underlay** -- usually done with BGP for on-prem or some form of dynamically-provisioned static routes for public cloud environments.
36 | * **Encapsulating in the overlay** -- VXLAN is still the most popular encap type.
37 |
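For illustration, the full-mesh static-route option from the list above boils down to something like this on every Node (the PodCIDRs and peer Node IPs below are made up):

```bash
# On one Node: a static route per peer Node's PodCIDR,
# with the NextHop set to that peer's internal IP
ip route add 10.244.1.0/24 via 192.168.0.11
ip route add 10.244.2.0/24 via 192.168.0.12

# Verify the resulting routes
ip route show | grep 10.244
```
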
38 | {{% notice info %}}
39 | The above mechanisms are not determined exclusively by the underlying network. Plugins can use a mixture of different methods (e.g. host-based static routes for the same L2 segment and overlays for anything else) and the choice can be made purely based on operational complexity (e.g. overlays over BGP).
40 | {{% /notice %}}
41 |
42 | {{% notice note %}}
43 | It goes without saying that the base underlying assumption is that Nodes can reach each other using their Internal IPs. It is the responsibility of the infrastructure provider (IaaS) to fulfil this requirement.
44 | {{% /notice %}}
45 |
46 | ## Secondary Goals
47 |
48 | In addition to the base functionality described above, there's always a need to do things like:
49 |
50 | * **IP address management** to keep track of IPs allocated to each individual Pod.
51 | * **Port mappings** to expose Pods to the outside world.
52 | * **Bandwidth control** to control egress/ingress traffic rates.
53 | * **Source NAT** for traffic leaving the cluster (e.g. Internet)
54 |
55 | These functions can be performed by the same monolithic plugin or via **plugin chaining**, where multiple plugins are specified in the configuration file and get invoked sequentially by the container runtime, as illustrated below.
56 |
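To make this more concrete, here is a rough sketch of a chained configuration file (the network name, bridge name and subnet are invented; the options follow the reference plugins' documented settings). The runtime invokes `bridge`, then `portmap`, then `bandwidth`, feeding each plugin the previous plugin's result:

```bash
cat <<'EOF' > /etc/cni/net.d/10-mynet.conflist
{
  "cniVersion": "0.3.1",
  "name": "mynet",
  "plugins": [
    {
      "type": "bridge",
      "bridge": "cni0",
      "isGateway": true,
      "ipMasq": true,
      "ipam": { "type": "host-local", "subnet": "10.244.1.0/24" }
    },
    { "type": "portmap", "capabilities": { "portMappings": true } },
    { "type": "bandwidth", "capabilities": { "bandwidth": true } }
  ]
}
EOF
```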
57 |
58 | {{% notice info %}}
59 | [CNI plugins repository](https://github.com/containernetworking/plugins) provides reference implementations of the most commonly used plugins.
60 | {{% /notice %}}
61 |
62 | ## Operation
63 |
64 | Contrary to the typical network plugin design approach that includes a long-lived stateful daemon, [CNI Specification](https://github.com/containernetworking/cni/blob/master/SPEC.md) defines an interface -- a set of input/output parameters that a CNI binary is expected to ingest/produce. This makes for a very clean design that is also very easy to swap and upgrade. The most beautiful thing is that the plugin becomes completely stateless -- it's just a binary file on a disk that gets invoked whenever a Pod gets created or deleted. Here's a sequence of steps that a container runtime has to do whenever a new Pod gets created:
65 |
66 | 1. It creates a new network namespace.
67 | 2. It reads and parses the CNI configuration file -- the (numerically) first file from `/etc/cni/net.d`
68 | 3. For every plugin specified in the configuration file, it invokes the corresponding binary, passing it the following information:
69 | * Environment variables `CNI_COMMAND`, `CNI_CONTAINERID`, `CNI_NETNS`, `CNI_IFNAME`, `CNI_PATH` and `CNI_ARGS`.
70 | * A minified version of the CNI configuration file (excluding any other plugins).
71 |
72 | The last step, if done manually, would look something like this:
73 |
74 | ```bash
75 | CNI_COMMAND=ADD \
76 | CNI_CONTAINERID=cid \
77 | CNI_NETNS=/var/run/netns/id \
78 | CNI_IFNAME=eth0 \
79 | CNI_PATH=/opt/bin/bin \
80 | CNI_ARGS="K8S_POD_NAMESPACE=foo;K8S_POD_NAME=bar" \
81 | cni_plugin < /etc/cni/net.d/01-cni.conf
82 | ```
83 |
84 | The CNI plugin then does all of the required interface plumbing and IP allocation and returns (prints to stdout) the resulting [data structure](https://github.com/containernetworking/cni/blob/master/SPEC.md#result). In the case of plugin chaining, all this information (original inputs + result) gets passed to all plugins along the chain.
85 |
86 | Despite its design simplicity, unless you have something else that takes care of establishing end-to-end reachability (e.g. cloud controller), a CNI binary must be accompanied by a long-running stateful daemon/agent. This daemon usually runs in the root network namespace and manages the Node's network stack between CNI binary invocations -- at the very least it adds and removes static routes as Nodes are added to or removed from the cluster. Its operation is not dictated by any standard and the only requirement is to establish Pod-to-Pod reachability.
87 |
88 | {{% notice note %}}
89 | In reality, this daemon does a lot more than just manage reachability and may include a kube-proxy replacement, Kubernetes controller, IPAM etc.
90 | {{% /notice %}}
91 |
92 |
93 | {{% notice tip %}}
94 | See [meshnet-cni](https://github.com/networkop/meshnet-cni#architecture) for an example of binary+daemon architecture.
95 | {{% /notice %}}
96 |
97 |
98 |
99 | ## Want to know more?
100 |
101 | To learn more about CNI, you can search for the talk "Kubernetes and the CNI: Where We Are and What's Next", which I cannot recommend highly enough. It is what shaped my current view of CNI and heavily inspired this article. Some other links I can recommend:
102 |
103 | * [Slides: Kubernetes and the CNI: Where We Are and What's Next](https://www.caseyc.net/cni-talk-kubecon-18.pdf)
104 | * [CNI Specification](https://github.com/containernetworking/cni/blob/master/SPEC.md)
105 | * [CNI plugin implemented in bash](https://www.altoros.com/blog/kubernetes-networking-writing-your-own-simple-cni-plug-in-with-bash/)
106 | * [EVPN CNI plugin](http://logingood.github.io/kubernetes/cni/2016/05/14/netns-and-cni.html)
107 | * [Writing your first CNI plugin](http://dougbtv.com/nfvpe/2017/06/22/cni-tutorial/)
108 | * [Building a meshnet-cni](https://networkop.co.uk/post/2018-11-k8s-topo-p1/)
109 | * [CNI plugin chaining](https://karampok.me/posts/chained-plugins-cni/)
110 |
--------------------------------------------------------------------------------
/content/cni/calico.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "calico"
3 | menuTitle: "calico"
4 | date: 2020-11-16T12:33:04+01:00
5 | weight: 15
6 | ---
7 |
8 | [Calico](https://docs.projectcalico.org/about/about-calico) is another example of a full-blown Kubernetes "networking solution" with functionality that includes a network policy controller, a kube-proxy replacement and network traffic observability. CNI functionality is still the core element of Calico, and the focus of this chapter will be on how it satisfies the Kubernetes network model [requirements](/cni/#main-goals).
9 |
10 |
11 | * **Connectivity** is set up by creating a `veth` link and moving one side of that link into a Pod's namespace. The other side of the link is left dangling in the node's root namespace. For each local Pod, Calico sets up a PodIP host-route pointing over the veth link.
12 |
13 | {{% notice note %}}
14 | One oddity of Calico CNI is that the node end of the veth link does not have an IP address. In order to provide Pod-to-Node egress connectivity, each `veth` link is set up with `proxy_arp`, which makes the root NS respond to any ARP request coming from the Pod (assuming that the node has a default route itself).
15 | {{% /notice %}}
16 |
17 | * **Reachability** can be established in two different ways:
18 |
19 | 1. Static routes and overlays -- Calico supports IPIP and VXLAN and has an option to only setup tunnels for traffic crossing the L3 subnet boundary.
20 |
21 | 2. BGP -- the most popular choice for on-prem deployments, it works by configuring a [Bird](https://bird.network.cz/) BGP speaker on every node and setting up peerings to ensure that reachability information gets propagated to every node. There are several [options](https://docs.projectcalico.org/networking/bgp) for how to set up this peering, including full-mesh between nodes, dedicated route-reflector node and external peering with the physical network.
22 |
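For illustration, the external-peering option above can be expressed as a single BGPPeer resource applied with `calicoctl` (assuming it is installed; the peer IP and AS number are placeholders):

```bash
# A hypothetical global BGPPeer: every node peers with an off-cluster route reflector
calicoctl apply -f - <<'EOF'
apiVersion: projectcalico.org/v3
kind: BGPPeer
metadata:
  name: external-rr
spec:
  peerIP: 172.18.0.50
  asNumber: 64512
EOF
```
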
23 | {{% notice info %}}
24 | The above two modes are not mutually exclusive, BGP can be used with IPIP in public cloud environments. For a complete list of networking options for both on-prem and public cloud environments, refer to [this guide](https://docs.projectcalico.org/networking/determine-best-networking).
25 | {{% /notice %}}
26 |
27 | For demonstration purposes, we'll use a BGP-based configuration option with external off-cluster route-reflector. The fully converged and populated IP and MAC tables will look like this:
28 |
29 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=5Q_VDU4fQs1RRTjQc7gX&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
30 |
31 |
32 | ### Lab
33 |
34 | Assuming that the lab environment is already [set up](/lab/), calico can be enabled with the following commands:
35 |
36 | ```bash
37 | make calico
38 | ```
39 |
40 | Check that the calico-node daemonset has all pods in `READY` state:
41 |
42 | ```bash
43 | $ kubectl -n calico-system get daemonset
44 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
45 | calico-node 3 3 3 3 3 kubernetes.io/os=linux 61s
46 | ```
47 |
48 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin:
49 |
50 | ```bash
51 | make nuke-all-pods
52 | ```
53 |
54 | To make sure kube-proxy and calico set up the right set of NAT rules, existing NAT tables need to be flushed and re-populated:
55 |
56 | ```
57 | make flush-nat && make calico-restart
58 | ```
59 |
60 | Build and start a GoBGP-based route reflector:
61 |
62 | ```
63 | make gobgp-build && make gobgp-rr
64 | ```
65 |
66 | Finally, reconfigure Calico's BGP daemonset to peer with the GoBGP route reflector:
67 |
68 | ```
69 | make gobgp-calico-patch
70 | ```
71 |
72 | ---
73 |
74 | Here's how the information from the diagram can be validated (using `worker2` as an example):
75 |
76 | 1. Pod IP and default route
77 |
78 | ```bash
79 | $ NODE=k8s-guide-worker2 make tshoot
80 | bash-5.0# ip -4 -br addr show dev eth0
81 | eth0@if2 UP 10.244.190.5/32
82 |
83 | bash-5.0# ip route
84 | default via 169.254.1.1 dev eth0
85 | 169.254.1.1 dev eth0 scope link
86 | ```
87 |
88 | Note how the default route is pointing to the fake next-hop address `169.254.1.1`. This will be the same for all Pods and this IP will resolve to the same MAC address configured on all veth links:
89 |
90 | ```
91 | bash-5.0# ip neigh
92 | 169.254.1.1 dev eth0 lladdr ee:ee:ee:ee:ee:ee REACHABLE
93 | ```
94 |
95 | 2. Node's routing table
96 |
97 | ```bash
98 | $ docker exec k8s-guide-worker2 ip route
99 | default via 172.18.0.1 dev eth0
100 | 10.244.175.0/24 via 172.18.0.4 dev eth0 proto bird
101 | 10.244.190.0 dev calid7f7f4e15dd scope link
102 | blackhole 10.244.190.0/24 proto bird
103 | 10.244.190.1 dev calid599cd3d268 scope link
104 | 10.244.190.2 dev cali82aeec08a68 scope link
105 | 10.244.190.3 dev calid2e34ad38c6 scope link
106 | 10.244.190.4 dev cali4a822ce5458 scope link
107 | 10.244.190.5 dev cali0ad20b06c15 scope link
108 | 10.244.236.0/24 via 172.18.0.5 dev eth0 proto bird
109 | 172.18.0.0/16 dev eth0 proto kernel scope link src 172.18.0.3
110 | ```
111 |
112 | A few interesting things to note in the above output:
113 |
114 | * The 2 x /24 routes programmed by `bird` are the PodCIDR ranges of the other two nodes.
115 | * The blackhole /24 route is the PodCIDR of the local node.
116 | * Inside the local PodCIDR there's a /32 host-route configured for each running Pod.
117 |
118 | 3. BGP RIB of the GoBGP route reflector
119 |
120 | ```
121 | docker exec gobgp gobgp global rib
122 |
123 | Network Next Hop AS_PATH Age Attrs
124 | *> 10.244.175.0/24 172.18.0.4 00:05:04 [{Origin: i} {LocalPref: 100}]
125 | *> 10.244.190.0/24 172.18.0.3 00:05:04 [{Origin: i} {LocalPref: 100}]
126 | *> 10.244.236.0/24 172.18.0.5 00:05:03 [{Origin: i} {LocalPref: 100}]
127 |
128 | ```
129 |
130 | ### A day in the life of a Packet
131 |
132 | Let's track what happens when Pod-1 (actual name is net-tshoot-rg2lp) tries to talk to Pod-3 (net-tshoot-6wszq).
133 |
134 | {{% notice note %}}
135 | We'll assume that the ARP and MAC tables are converged and fully populated. To do that, issue a ping from Pod-1 to Pod-3's IP (10.244.236.0).
136 | {{% /notice %}}
137 |
138 | 0. Check the peer interface index of the veth link of Pod-1:
139 |
140 | ```
141 | $ kubectl -n default exec net-tshoot-rg2lp -- ip -br addr show dev eth0
142 | 3: eth0@if14: mtu 1410 qdisc noqueue state UP mode DEFAULT group default
143 | link/ether b2:24:13:ec:77:42 brd ff:ff:ff:ff:ff:ff link-netnsid 0
144 | ```
145 |
146 | This information (if14) will be used in step 2 to identify the node side of the veth link.
147 |
148 | 1. Pod-1 wants to send a packet to `10.244.236.0`. Its network stack performs a route lookup:
149 |
150 | ```bash
151 | $ kubectl -n default exec net-tshoot-rg2lp -- ip route get 10.244.236.0
152 | 10.244.236.0 via 169.254.1.1 dev eth0 src 10.244.175.4 uid 0
153 | cache
154 | ```
155 |
156 | 2. The nexthop IP is `169.254.1.1` on `eth0`, ARP table lookup is needed to get the destination MAC:
157 |
158 | ```bash
159 | $ kubectl -n default exec net-tshoot-rg2lp -- ip neigh show 169.254.1.1
160 | 169.254.1.1 dev eth0 lladdr ee:ee:ee:ee:ee:ee STALE
161 | ```
162 |
163 | As mentioned above, the node side of the veth link doesn't have any IP configured:
164 |
165 | ```
166 | $ docker exec k8s-guide-worker ip addr show dev if14
167 | 14: calic8441ae7134@if3: mtu 1410 qdisc noqueue state UP group default
168 | link/ether ee:ee:ee:ee:ee:ee brd ff:ff:ff:ff:ff:ff link-netns cni-262ff521-1b00-b1c9-f0d5-0943a48a2ddc
169 | ```
170 |
171 | So in order to respond to an ARP request for `169.254.1.1`, all veth links have proxy ARP enabled:
172 | ```
173 | $ docker exec k8s-guide-worker cat /proc/sys/net/ipv4/conf/calic8441ae7134/proxy_arp
174 | 1
175 | ```
176 |
177 | 3. The packet reaches the root namespace of the ingress node, where another L3 lookup takes place:
178 |
179 | ```
180 | $ docker exec k8s-guide-worker ip route get 10.244.236.0 fibmatch
181 | 10.244.236.0/24 via 172.18.0.5 dev eth0 proto bird
182 | ```
183 |
184 | 4. The packet is sent to the target node where another FIB lookup is performed:
185 |
186 | ```
187 | $ docker exec k8s-guide-control-plane ip route get 10.244.236.0 fibmatch
188 | 10.244.236.0 dev cali0ec6986a945 scope link
189 | ```
190 |
191 | The target IP is reachable over the `veth` link so ARP is used to determine the destination MAC address:
192 |
193 | ```
194 | docker exec k8s-guide-control-plane ip neigh show 10.244.236.0
195 | 10.244.236.0 dev cali0ec6986a945 lladdr de:85:25:60:86:5b STALE
196 | ```
197 |
198 | 5. Finally, the packet gets delivered to the `eth0` interface of the target pod:
199 |
200 | ```
201 | kubectl exec net-tshoot-6wszq -- ip -br addr show dev eth0
202 | eth0@if2 UP 10.244.236.0/32 fe80::dc85:25ff:fe60:865b/64
203 | ```
204 |
205 | ### SNAT functionality
206 |
207 | SNAT functionality for traffic egressing the cluster is done in two stages:
208 |
209 | 1. `cali-POSTROUTING` chain is inserted at the top of the POSTROUTING chain.
210 |
211 | 2. Inside that chain, `cali-nat-outgoing` SNATs all egress traffic originating from the `cali40masq-ipam-pools` ipset.
212 |
213 | ```
214 | iptables -t nat -vnL
215 | <...>
216 | Chain POSTROUTING (policy ACCEPT 5315 packets, 319K bytes)
217 | pkts bytes target prot opt in out source destination
218 | 7844 529K cali-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:O3lYWMrLQYEMJtB5 */
219 | <...>
220 | Chain cali-POSTROUTING (1 references)
221 | pkts bytes target prot opt in out source destination
222 | 7844 529K cali-fip-snat all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:Z-c7XtVd2Bq7s_hA */
223 | 7844 529K cali-nat-outgoing all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:nYKhEzDlr11Jccal */
224 | <...>
225 | Chain cali-nat-outgoing (1 references)
226 | pkts bytes target prot opt in out source destination
227 | 1 84 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* cali:flqWnvo8yq4ULQLa */ match-set cali40masq-ipam-pools src ! match-set cali40all-ipam-pools dst random-fully
228 |
229 | ```
230 |
231 | Calico configures all IPAM pools as ipsets for more efficient matching within iptables. These pools can be viewed on each individual node:
232 |
233 | ```
234 | $ docker exec k8s-guide-control-plane ipset -L cali40masq-ipam-pools
235 | Name: cali40masq-ipam-pools
236 | Type: hash:net
237 | Revision: 6
238 | Header: family inet hashsize 1024 maxelem 1048576
239 | Size in memory: 512
240 | References: 1
241 | Number of entries: 1
242 | Members:
243 | 10.244.128.0/17
244 | ```
245 |
246 | ### Caveats and Gotchas
247 |
248 | * Calico supports GoBGP-based routing, but only as an [experimental feature](https://github.com/projectcalico/calico-bgp-daemon).
249 | * BGP configs are generated from templates based on the contents of the Calico [datastore](https://docs.projectcalico.org/getting-started/kubernetes/hardway/the-calico-datastore). This makes the customization of the generated BGP config very [problematic](https://github.com/projectcalico/calico/issues/1604).
250 |
251 |
252 |
253 |
--------------------------------------------------------------------------------
/content/cni/flannel.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "flannel"
3 | menuTitle: "flannel"
4 | date: 2020-09-13T17:33:04+01:00
5 | weight: 12
6 | ---
7 |
8 | [Flannel](https://github.com/coreos/flannel) is another example of a dual CNI plugin design:
9 |
10 | * **Connectivity** is taken care of by the `flannel` binary. This binary is a `metaplugin` -- a plugin that wraps other reference CNI plugins. In the [simplest case](https://github.com/containernetworking/plugins/tree/master/plugins/meta/flannel#operation), it generates a `bridge` plugin configuration and "delegates" the connectivity setup to it.
11 |
12 | * **Reachability** is taken care of by the Daemonset running `flanneld`. Here's an approximate sequence of actions of what happens when the daemon starts:
13 | 1. It queries the Kubernetes Node API to discover its local `PodCIDR` and `ClusterCIDR`. This information is saved in the `/run/flannel/subnet.env` and is used by the flannel metaplugin to generate the `host-local` IPAM configuration.
14 |     2. It creates a VXLAN interface called `flannel.1` and updates the Kubernetes Node object with its MAC address (along with its own Node IP).
15 | 3. Using Kubernetes API, it discovers the VXLAN MAC information of other Nodes and builds a local unicast head-end replication (HER) table for its vxlan interface.
16 |
17 | {{% notice info %}}
18 | This plugin assumes that daemons have a way to exchange information (e.g. the VXLAN MAC). Previously, this required a separate database (hosted etcd), which was considered a big disadvantage. The new version of the plugin uses the Kubernetes API to store that information in the annotations of the Node API object, as shown below.
19 | {{% /notice %}}
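
These annotations can be inspected directly on the Node objects (annotation keys as used by recent kube-flannel releases):

```bash
# flanneld publishes its VTEP MAC (backend-data) and public IP as Node annotations
kubectl describe node k8s-guide-worker2 | grep flannel.alpha.coreos.com
```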
20 |
21 | The fully converged IP and MAC tables will look like this:
22 |
23 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=jdjgs82ws8dfcGyB_vlg&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
24 |
25 |
26 |
27 |
28 |
29 | ### Lab
30 |
31 | Assuming that the lab is already [set up](/lab/), flannel can be enabled with the following command:
32 |
33 | ```bash
34 | make flannel
35 | ```
36 |
37 | Check that the flannel daemonset has reached the `READY` state:
38 |
39 | ```bash
40 | $ kubectl -n kube-system get daemonset -l app=flannel
41 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
42 | kube-flannel-ds 3 3 3 3 3 90s
43 | ```
44 |
45 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin:
46 |
47 | ```bash
48 | make nuke-all-pods
49 | ```
50 |
51 | Here's how the information from the diagram can be validated (using `worker2` as an example):
52 |
53 | 1. Pod IP and default route
54 |
55 | ```bash
56 | $ NODE=k8s-guide-worker2 make tshoot
57 | bash-5.0# ip route get 1.1
58 | 1.1.0.0 via 10.244.2.1 dev eth0 src 10.244.2.6 uid 0
59 | ```
60 |
61 | 2. Node routing table
62 |
63 | ```bash
64 | $ docker exec -it k8s-guide-worker2 ip route
65 | default via 172.18.0.1 dev eth0
66 | 10.244.0.0/24 via 10.244.0.0 dev flannel.1 onlink
67 | 10.244.1.0/24 via 10.244.1.0 dev flannel.1 onlink
68 | 10.244.2.0/24 dev cni0 proto kernel scope link src 10.244.2.1
69 | 172.18.0.0/16 dev eth0 proto kernel scope link src 172.18.0.2
70 | ```
71 |
72 | 3. Static ARP entries for NextHops
73 |
74 | ```bash
75 | $ docker exec -it k8s-guide-worker2 ip neigh | grep PERM
76 | 10.244.1.0 dev flannel.1 lladdr ce:0a:4f:22:a4:2a PERMANENT
77 | 10.244.0.0 dev flannel.1 lladdr 5a:11:99:ab:8c:22 PERMANENT
78 |
79 | ```
80 |
81 | 4. VXLAN forwarding database
82 |
83 | ```bash
84 | $ docker exec -it k8s-guide-worker2 bridge fdb show dev flannel.1
85 | 5a:11:99:ab:8c:22 dst 172.18.0.3 self permanent
86 | ce:0a:4f:22:a4:2a dst 172.18.0.4 self permanent
87 | ```
88 |
89 | ### A day in the life of a Packet
90 |
91 | Let's track what happens when Pod-1 tries to talk to Pod-3.
92 |
93 | {{% notice note %}}
94 | We'll assume that the ARP and MAC tables are converged and fully populated.
95 | {{% /notice %}}
96 |
97 | 1\. Pod-1 wants to send a packet to `10.244.0.2`. Its network stack looks up the routing table to find the NextHop IP:
98 |
99 | ```bash
100 | $ kubectl exec -it net-tshoot-4sg7g -- ip route get 10.244.0.2
101 | 10.244.0.2 via 10.244.1.1 dev eth0 src 10.244.1.6 uid 0
102 | ```
103 |
104 | 2\. The packet reaches the `cni0` bridge in the root network namespace, where the lookup is performed again:
105 |
106 | ```bash
107 | $ docker exec -it k8s-guide-worker ip route get 10.244.0.2
108 | 10.244.0.2 via 10.244.0.0 dev flannel.1 src 10.244.1.0 uid 0
109 | ```
110 |
111 | 3\. The NextHop and the outgoing interfaces are set, the ARP table lookup returns the static entry provisioned by the `flanneld`:
112 |
113 | ```bash
114 | $ docker exec -it k8s-guide-worker ip neigh get 10.244.0.0 dev flannel.1
115 | 10.244.0.0 dev flannel.1 lladdr 5a:11:99:ab:8c:22 PERMANENT
116 | ```
117 |
118 | 4\. Next, the FDB of the VXLAN interface is consulted to find out the destination VTEP IP:
119 |
120 | ```bash
121 | $ docker exec -it k8s-guide-worker bridge fdb | grep 5a:11:99:ab:8c:22
122 | 5a:11:99:ab:8c:22 dev flannel.1 dst 172.18.0.3 self permanent
123 | ```
124 |
125 | 5\. The packet is VXLAN-encapsulated and sent to the `control-node` where `flannel.1` matches the VNI and the VXLAN MAC:
126 |
127 | ```bash
128 | $ docker exec -it k8s-guide-control-plane ip link show flannel.1
129 | 3: flannel.1: mtu 1450 qdisc noqueue state UNKNOWN mode DEFAULT group default
130 | link/ether 5a:11:99:ab:8c:22 brd ff:ff:ff:ff:ff:ff
131 | ```
132 |
133 | 6\. The packet gets decapsulated and its original destination IP looked up in the main routing table:
134 |
135 | ```bash
136 | $ docker exec -it k8s-guide-control-plane ip route get 10.244.0.2
137 | 10.244.0.2 dev cni0 src 10.244.0.1 uid 0
138 | ```
139 |
140 | 7\. The ARP and bridge tables are then consulted to find the outgoing veth interface:
141 |
142 | ```bash
143 | $ docker exec -it k8s-guide-control-plane ip neigh get 10.244.0.2 dev cni0
144 | 10.244.0.2 dev cni0 lladdr 7e:46:23:43:6f:ec REACHABLE
145 | $ docker exec -it k8s-guide-control-plane bridge fdb get 7e:46:23:43:6f:ec br cni0
146 | 7e:46:23:43:6f:ec dev vethaabf9eb2 master cni0
147 | ```
148 |
149 | 8\. Finally, the packet arrives in the Pod-3's network namespace where it gets processed by the local network stack:
150 |
151 | ```bash
152 | $ kubectl exec -it net-tshoot-rkg46 -- ip route get 10.244.0.2
153 | local 10.244.0.2 dev lo src 10.244.0.2 uid 0
154 | ```
155 |
156 | ### SNAT functionality
157 |
158 | Similar to [kindnet](/cni/kindnet/), `flanneld` sets up SNAT rules to enable egress connectivity for the Pods; the only difference is that it does this directly inside the `POSTROUTING` chain:
159 |
160 | ```bash
161 | Chain POSTROUTING (policy ACCEPT 327 packets, 20536 bytes)
162 | pkts bytes target prot opt in out source destination
163 | 0 0 RETURN all -- * * 10.244.0.0/16 10.244.0.0/16
164 | 0 0 MASQUERADE all -- * * 10.244.0.0/16 !224.0.0.0/4 random-fully
165 | 0 0 RETURN all -- * * !10.244.0.0/16 10.244.0.0/24
166 | 0 0 MASQUERADE all -- * * !10.244.0.0/16 10.244.0.0/16 random-fully
167 | ```
168 |
169 | ### Caveats and Gotchas
170 |
171 | * The official [installation manifest](https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml) does not install the CNI binary by default. This binary is distributed as a part of [reference CNI plugins](https://github.com/containernetworking/plugins/releases) and needs to be installed separately.
172 | * flannel can run in a `direct routing` mode, which works by installing static routes for hosts on the same subnet.
173 | * flannel can use generic UDP encapsulation instead of VXLAN.
--------------------------------------------------------------------------------
/content/cni/iaas/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Public and Private Clouds
3 | menuTitle: "IaaS"
4 | weight: 13
5 | summary: Cloud-based Kubernetes deployments
6 | ---
7 |
8 | Kubernetes was designed to run inside a cloud environment. The idea is that the IaaS layer can provide resources that Kubernetes can consume without having to implement them internally. These resources include VMs (for Node management), L4 load-balancers (for Services of type LoadBalancer) and persistent storage (for PersistentVolumes). The reason why this is important for networking is that the underlying cloud SDN is also programmable and can be managed by Kubernetes itself.
9 |
10 | {{% notice note %}}
11 | Although it is possible to run Kubernetes directly on bare metal, all of these problems will still need to be addressed individually by the cluster administrator.
12 | {{% /notice %}}
13 |
14 | ## Operation
15 |
16 | A typical managed Kubernetes deployment includes a simple CNI plugin called [kubenet](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/#kubenet) which is another example of a `metaplugin` -- it [re-uses](https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/dockershim/network/kubenet/kubenet_linux.go#L88) `bridge`, `host-local` and `loopback` reference CNI plugins and orchestrates them to provide **connectivity**.
17 |
18 | {{% notice note %}}
19 | It is enabled with a kubelet argument `--network-plugin=kubenet` which, for managed Kubernetes, means that it cannot be replaced with a different CNI plugin.
20 | {{% /notice %}}
21 |
22 |
23 | One notable difference with `kubenet` is that there is no daemon component in the plugin. In this case, **reachability** is provided by the underlying SDN and orchestrated by a [Cloud Controller Manager](https://kubernetes.io/docs/concepts/architecture/cloud-controller/). Behind the scenes, for each PodCIDR it installs a **static route** pointing to the Node IP -- this way traffic between Pods can just follow the default route in the root namespace, safely assuming that the underlying virtual router will know where to forward the packets.
24 |
25 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=A5cMEZUylDs-XIrDOgQv&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
26 |
27 | {{% notice note %}}
28 | If you're interested in using BGP to establish reachability in cloud environment, be sure to check out [cloudroutesync](https://github.com/networkop/cloudroutesync).
29 | {{% /notice %}}
30 |
31 |
32 | ### GKE
33 |
34 | Google's CCM uses [IP Alias ranges](https://cloud.google.com/vpc/docs/alias-ip) to provide reachability. The VPC subnet gets configured with a secondary address range that is the same as the cluster CIDR:
35 |
36 | ```
37 | $ gcloud compute networks subnets describe private-subnet | grep -A 3 secondaryIpRanges
38 | secondaryIpRanges:
39 | - ipCidrRange: 10.244.0.0/22
40 | rangeName: private-secondary
41 | ```
42 |
43 | Each new Node VM gets created with an alias range set to the Node's PodCIDR:
44 |
45 | ```
46 | $ gcloud compute instances describe gke-node | grep -A 3 aliasIpRanges
47 | networkInterfaces:
48 | - aliasIpRanges:
49 | - ipCidrRange: 10.224.1.0/24
50 | subnetworkRangeName: private-secondary
51 | ```
52 |
53 | Inside of the Node VM there's a standard set of interfaces:
54 |
55 | ```
56 | $ ip -4 -br add show
57 | lo UNKNOWN 127.0.0.1/8
58 | eth0 UP 172.16.0.12/32
59 | cbr0 UP 10.224.1.1/24
60 | ```
61 |
62 | The routing table only has a single non-directly connected default route:
63 |
64 | ```
65 | $ ip route
66 | default via 172.16.0.1 dev eth0 proto dhcp metric 1024
67 | 172.16.0.12 dev eth0 proto dhcp scope link metric 1024
68 | 10.224.1.0/24 dev cbr0 proto kernel scope link src 10.224.1.1
69 | ```
70 |
71 | {{% notice info %}}
72 | IP Alias is a special kind of static route that, amongst [other benefits](https://cloud.google.com/vpc/docs/alias-ip#key_benefits_of_alias_ip_ranges), gets checked for potential conflicts and automatically updates the corresponding anti-spoofing rules to allow VM to emit packets with non-native IPs.
73 | {{% /notice %}}
74 |
75 | ### AKS
76 |
77 | Azure uses normal static routes to set up reachability:
78 |
79 | ```
80 | az network route-table show --ids "id" | grep -A 5 10.224.1.0
81 | "addressPrefix": "10.224.1.0/24",
82 | "etag": "W/\"tag\"",
83 | "id": "id",
84 | "name": "name",
85 | "nextHopIpAddress": "172.16.0.12",
86 | "nextHopType": "VirtualAppliance",
87 | ```
88 |
89 | Inside of the Node VM there's a standard set of interfaces:
90 |
91 |
92 | ```
93 | # ip -4 -br add show
94 | lo UNKNOWN 127.0.0.1/8
95 | eth0 UP 172.16.0.12/16
96 | cbr0 UP 10.224.1.1/24
97 | ```
98 |
99 | And there is only a single non-directly connected route pointing out the primary interface:
100 |
101 | ```
102 | # ip route
103 | default via 172.16.0.1 dev eth0
104 | 172.16.0.0/16 dev eth0 proto kernel scope link src 172.16.0.12
105 | 10.224.1.0/24 dev cbr0 proto kernel scope link src 10.224.1.1
106 | 168.63.129.16 via 172.16.0.1 dev eth0
107 | 169.254.169.254 via 172.16.0.1 dev eth0
108 | ```
109 |
110 | {{% notice info %}}
111 | Azure Nodes can also be configured with ["Azure CNI"](https://docs.microsoft.com/en-us/azure/aks/configure-azure-cni) where Pod IPs get allocated from the same range as the underlying VNET.
112 | {{% /notice %}}
113 |
114 |
115 | ### EKS
116 |
117 | EKS takes a slightly different approach and runs a special [AWS CNI plugin](https://github.com/aws/amazon-vpc-cni-k8s) as a daemonset on all nodes. The functionality of this plugin is documented in great detail in the [proposal](https://github.com/aws/amazon-vpc-cni-k8s/blob/master/docs/cni-proposal.md).
118 |
119 | {{% notice info %}}
120 | The VPC-native routing is achieved by assigning secondary IPs to each Node's ENI, and attaching more ENIs as the max number of IPs per ENI [limit](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#AvailableIpPerENI) is reached.
121 | {{% /notice %}}
122 |
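If needed, those secondary IPs can be listed with the AWS CLI (a sketch; the instance ID is a placeholder):

```bash
# Secondary private IPs handed out to Pods show up on the worker node's ENIs
aws ec2 describe-network-interfaces \
  --filters Name=attachment.instance-id,Values=i-0123456789abcdef0 \
  --query 'NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress'
```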
123 |
124 | One thing worth mentioning here is that in EKS's case, it's possible to replace the AWS CNI plugin with a number of [3rd party plugins](https://docs.aws.amazon.com/eks/latest/userguide/alternate-cni-plugins.html). In this case, VPC-native routing is not available, since the VPC virtual router won't be aware of the PodCIDRs, and the only option is to run those plugins in overlay mode -- building a full mesh of VXLAN tunnels and static routes that forward traffic over them.
--------------------------------------------------------------------------------
/content/cni/kindnet.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "kindnet"
3 | menuTitle: "kindnet"
4 | weight: 11
5 | date: 2020-09-13T17:33:04+01:00
6 | ---
7 |
8 |
9 | Here is how [kindnet](https://github.com/aojea/kindnet#kindnet-components) satisfies the two main CNI plugin [requirements](/cni/):
10 |
11 | * **Reachability** is established by installing one static route per peer Node with NextHops pointing to the internal Node IPs. These routes get checked every 10 seconds to detect if there were any changes.
12 | * **Connectivity** is established by a mix of reference CNI plugins -- [`ptp`](https://www.cni.dev/plugins/current/main/ptp/) is used to create veth links, [`host-local`](https://www.cni.dev/plugins/current/ipam/host-local/) to allocate IPs and [`portmap`](https://www.cni.dev/plugins/current/meta/portmap/) to configure port mappings. The configuration file gets generated by each of the `kindnetd` daemons on startup.
13 |
14 | The diagram below shows what a fully converged routing table will look like:
15 |
16 |
17 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=a-ASnmM8o81X1hkJ6S8l&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
18 |
19 | ### Lab
20 |
21 | This plugin is built into the Lab cluster by default, so the only thing required is to bring up the [Lab environment](/lab/)
22 |
23 | ```
24 | make setup && make up
25 | ```
26 |
27 | Here's how to validate and verify the above diagram in the Lab environment, using the second Node as an example:
28 |
29 | 1. Pod IP and default route
30 |
31 | Pod IP should have a /24 subnet mask (same as `PodCIDR`) and the default route pointing to the first IP of that subnet.
32 |
33 | ```
34 | $ NODE=k8s-guide-worker2 make tshoot
35 | bash-5.0# ip -br -4 add show eth0
36 | eth0@if5 UP 10.244.2.8/24
37 | bash-5.1# ip route
38 | default via 10.244.2.1 dev eth0
39 | 10.244.2.0/24 via 10.244.2.1 dev eth0 src 10.244.2.8
40 | 10.244.2.1 dev eth0 scope link src 10.244.2.8
41 | ```
42 |
43 | {{% notice note %}}
44 | Note how the Pod routing is set up so that all the traffic, including the intra-subnet Pod-to-Pod communication, is sent over the same next-hop. This allows for all Pods to be interconnected via L3 without relying on a bridge or ARP for neighbor discovery.
45 | {{% /notice %}}
46 |
47 |
48 | 2. Node routing table
49 |
50 | It should contain one /32 host-route per local Pod and one /24 per peer node.
51 |
52 | ```
53 | docker exec -it k8s-guide-worker2 bash
54 | root@k8s-guide-worker2:/# ip route
55 | default via 172.18.0.1 dev eth0
56 | 10.244.0.0/24 via 172.18.0.10 dev eth0
57 | 10.244.1.0/24 via 172.18.0.11 dev eth0
58 | 10.244.2.2 dev vethf821f7f9 scope host
59 | 10.244.2.3 dev veth87514986 scope host
60 | 10.244.2.4 dev veth9829983c scope host
61 | 10.244.2.5 dev veth010c83ae scope host
62 | 10.244.2.8 dev vetha1079faf scope host
63 | ```
64 |
65 | 3. PodCIDR gateway
66 |
67 | One notable thing is that the root namespace side of all veth links has the same IP address:
68 |
69 | ```
70 | root@k8s-guide-worker2:/# ip -br -4 addr show | grep veth
71 | vethf821f7f9@if3 UP 10.244.2.1/32
72 | veth87514986@if3 UP 10.244.2.1/32
73 | veth9829983c@if3 UP 10.244.2.1/32
74 | veth010c83ae@if3 UP 10.244.2.1/32
75 | vetha1079faf@if3 UP 10.244.2.1/32
76 | ```
77 |
78 | They each act as the default gateway for their peer Pods and don't have to be attached to a bridge.
79 |
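A quick way to confirm this from inside a Pod is to check which MAC address the gateway IP resolves to -- it will be the host side of that Pod's own veth link (a sketch reusing the lab commands from above):

```bash
NODE=k8s-guide-worker2 make tshoot
# then, inside the Pod's shell:
ping -c 1 10.244.2.1 > /dev/null
ip neigh show 10.244.2.1
```
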
80 | ### A day in the life of a Packet
81 |
82 | Let's track what happens when Pod-1 tries to talk to Pod-3.
83 |
84 | {{% notice note %}}
85 | We'll assume that the ARP and MAC tables are converged and fully populated.
86 | {{% /notice %}}
87 |
88 | 1. Pod-1 wants to send a packet to `10.244.0.5`. Its network stack looks up the routing table to find the NextHop IP:
89 |
90 | ```
91 | $ kubectl exec -it net-tshoot-wxgcw -- ip route get 10.244.0.5
92 | 10.244.0.5 via 10.244.1.1 dev eth0 src 10.244.1.3 uid 0
93 | ```
94 |
95 | 2. The packet is sent down the veth link and pops out in the root network namespace of the host, which repeats the lookup:
96 |
97 | ```
98 | $ docker exec -it k8s-guide-worker ip route get 10.244.0.5
99 | 10.244.0.5 via 172.18.0.10 dev eth0 src 172.18.0.11 uid 0
100 | ```
101 |
102 | 3. The packet gets L2-switched by the `kind` bridge and enters the control-plane's root network namespace:
103 |
104 | ```
105 | docker exec -it k8s-guide-control-plane ip route get 10.244.0.5
106 | 10.244.0.5 dev veth9f517bf3 src 10.244.0.1 uid 0
107 | ```
108 |
109 | 4. Finally, the packet arrives in the Pod-3's network namespace where it gets processed by the local network stack:
110 |
111 | ```
112 | kubectl exec -it net-tshoot-x6wv9 -- ip route get 10.244.0.5
113 | local 10.244.0.5 dev lo src 10.244.0.5 uid 0
114 | ```
115 |
116 | ### SNAT functionality
117 |
118 | In addition to the main CNI functionality, `kindnet` also sets up a number of IP masquerade (Source NAT) rules. These rules allow Pods to access the same networks as the hosting Node (e.g. Internet). The new `KIND-MASQ-AGENT` chain is inserted into the NAT's `POSTROUTING` chain and includes a special `RETURN` rule to exclude all traffic in the cluster-cidr range (10.244.0.0/16):
119 |
120 | ```
121 | root@k8s-guide-worker2:/# iptables -t nat -nvL | grep -B 4 -A 4 KIND-MASQ
122 | Chain POSTROUTING (policy ACCEPT 3073 packets, 233K bytes)
123 | pkts bytes target prot opt in out source destination
124 | 61703 4686K KUBE-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
125 | 0 0 DOCKER_POSTROUTING all -- * * 0.0.0.0/0 172.18.0.1
126 | 54462 4060K KIND-MASQ-AGENT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type !LOCAL /* kind-masq-agent: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom KIND-MASQ-AGENT chain */
127 |
128 | Chain KIND-MASQ-AGENT (1 references)
129 | pkts bytes target prot opt in out source destination
130 | 46558 3587K RETURN all -- * * 0.0.0.0/0 10.244.0.0/16 /* kind-masq-agent: local traffic is not subject to MASQUERADE */
131 | 7904 473K MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* kind-masq-agent: outbound traffic is subject to MASQUERADE (must be last in chain) */
132 | ```
133 |
134 | ### Caveats and Gotchas
135 |
136 | * Assumes all Nodes are in the same L2 domain.
137 | * Relies on host-local, ptp, portmap and loopback [reference plugins](https://github.com/containernetworking/plugins#plugins-supplied).
138 |
--------------------------------------------------------------------------------
/content/cni/weave.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "weave"
3 | menuTitle: "weave"
4 | date: 2020-10-17T12:33:04+01:00
5 | weight: 14
6 | ---
7 |
8 | [Weave Net](https://www.weave.works/docs/net/latest/overview/) is one of the "heavyweight" CNI plugins with a wide range of features and its own proprietary control plane to disseminate routing information between nodes. The scope of the plugin extends far beyond the base CNI functionality examined in this chapter and includes Network Policies, Encryption, Multicast and support for other container orchestration platforms (Swarm, Mesos).
9 |
10 | Following a similar pattern, let's examine how `weave` achieves the base CNI plugin functionality:
11 |
12 | * **Connectivity** is set up by the `weave-net` binary by attaching Pods to the `weave` Linux bridge. The bridge is, in turn, attached to the Open vSwitch kernel datapath, which forwards the packets over the VXLAN interface towards the target node.
13 |
14 | {{% notice info %}}
15 | Although it would have been possible to attach containers directly to the OVS datapath (ODP), Linux bridge plays the role of an egress router for all local pods so that ODP is only used for pod-to-pod forwarding.
16 | {{% /notice %}}
17 |
18 | * **Reachability** is established by two separate mechanisms:
19 |
20 | 1. [Weave Mesh](https://github.com/weaveworks/mesh) helps agents discover each other, check health, connectivity and exchange node-local details, e.g. IPs for VXLAN tunnel endpoint.
21 | 2. OVS datapath acts as a standard learning L2 switch with flood-and-learn behaviour being [programmed](https://github.com/weaveworks/go-odp) by the local agent (based on information distributed by the Mesh). All pods get their IPs from a single cluster-wide subnet and see their peers as if they were attached to a single broadcast domain.
22 |
23 |
24 | {{% notice info %}}
25 | The cluster-wide CIDR range is still split into multiple non-overlapping ranges, which may look like node-local Pod CIDRs; however, all Pod IPs still have the same prefix length as the cluster CIDR, effectively making them part of the same L3 subnet.
26 | {{% /notice %}}
27 |
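One way to see how the cluster CIDR has been carved up between nodes is to query weave's IPAM state (assuming the in-pod `weave --local` script, used later in this chapter, supports the `status ipam` subcommand):

```bash
WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net \
  --field-selector spec.nodeName=k8s-guide-worker2 -o jsonpath='{.items[0].metadata.name}')
kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local status ipam
```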
28 |
29 | The fully converged and populated IP and MAC tables will look like this:
30 |
31 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=GzriSjSBuyDBEbTt2saz&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
32 |
33 |
34 |
35 |
36 | ### Lab
37 |
38 |
39 | Assuming that the lab is already [set up](/lab/), weave can be enabled with the following commands:
40 |
41 | ```bash
42 | make weave
43 | ```
44 |
45 | Check that the weave daemonset has reached the `READY` state:
46 |
47 | ```bash
48 | $ kubectl -n kube-system get daemonset -l name=weave-net
49 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
50 | weave-net 3 3 3 3 3 30s
51 | ```
52 |
53 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin:
54 |
55 | ```bash
56 | make nuke-all-pods
57 | ```
58 |
59 | To make sure kube-proxy and weave set up the right set of NAT rules, existing NAT tables need to be flushed and repopulated:
60 |
61 | ```
62 | make flush-nat && make weave-restart
63 | ```
64 |
65 | ---
66 |
67 | Here's how the information from the diagram can be validated (using `worker2` as an example):
68 |
69 | 1. Pod IP and default route
70 |
71 | ```bash
72 | $ NODE=k8s-guide-worker2 make tshoot
73 | bash-5.0# ip route
74 | default via 10.44.0.0 dev eth0
75 | 10.32.0.0/12 dev eth0 proto kernel scope link src 10.44.0.7
76 | ```
77 |
78 | 2. Node routing table
79 |
80 | ```bash
81 | $ docker exec -it k8s-guide-worker2 ip route
82 | default via 172.18.0.1 dev eth0
83 | 10.32.0.0/12 dev weave proto kernel scope link src 10.44.0.0
84 | 172.18.0.0/16 dev eth0 proto kernel scope link src 172.18.0.4
85 | ```
86 |
87 | 3. ODP configuration and flows (output omitted for brevity)
88 |
89 |
90 | ```
91 | WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net --field-selector spec.nodeName=k8s-guide-worker2 -o jsonpath='{.items[0].metadata.name}')
92 | kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report
93 | ```
94 |
95 | ### A day in the life of a Packet
96 |
97 | Let's track what happens when Pod-1 (actual name is net-tshoot-22drp) tries to talk to Pod-3 (net-tshoot-pbp7z).
98 |
99 | {{% notice note %}}
100 | We'll assume that the ARP and MAC tables are converged and fully populated. In order to do that, issue a ping command from Pod-1 to Pod-3's IP (10.40.0.1), for example as shown below.
101 | {{% /notice %}}
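
A single short ping is enough to populate the tables (the Pod name below is specific to this lab run and will be different in your environment):

```bash
kubectl exec -it net-tshoot-22drp -- ping -c 2 10.40.0.1
```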
102 |
103 |
104 | 1. Pod-1 wants to send a packet to `10.40.0.1`. Its network stack looks up the routing table:
105 |
106 | ```bash
107 | $ kubectl exec -it net-tshoot-22drp -- ip route get 10.40.0.1
108 | 10.40.0.1 dev eth0 src 10.32.0.4 uid 0
109 | cache
110 | ```
111 |
112 | 2. Since the target IP is from a directly-connected network, the next step is to check its local ARP table:
113 |
114 | ```bash
115 | $ kubectl exec -it net-tshoot-22drp -- ip neigh show 10.40.0.1
116 | 10.40.0.1 dev eth0 lladdr d6:8d:31:c4:95:85 STALE
117 | ```
118 |
119 | 3. The packet is sent out of the veth interface and hits the `weave` bridge in the root NS, where a L2 lookup is performed:
120 |
121 | ```
122 | $ docker exec -it k8s-guide-worker bridge fdb get d6:8d:31:c4:95:85 br weave
123 | d6:8d:31:c4:95:85 dev vethwe-bridge master weave
124 | ```
125 |
126 | 4. The packet is sent from the `weave` bridge down to the OVS kernel datapath over a veth link:
127 |
128 | ```
129 | $ docker exec -it k8s-guide-worker ip link | grep vethwe-
130 | 12: vethwe-datapath@vethwe-bridge: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1376 qdisc noqueue master datapath state UP mode DEFAULT group default
131 | 13: vethwe-bridge@vethwe-datapath: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1376 qdisc noqueue master weave state UP mode DEFAULT group default
132 | ```
133 |
134 | 5. The ODP does a flow lookup to determine what actions to apply to the packet (the output is redacted for brevity)
135 |
136 | ```
137 | $ WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net --field-selector spec.nodeName=k8s-guide-worker -o jsonpath='{.items[0].metadata.name}')
138 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report
139 | <...>
140 | {
141 | "FlowKeys": [
142 | "UnknownFlowKey{type: 22, key: 00000000, mask: 00000000}",
143 | "EthernetFlowKey{src: 0a:75:b7:d0:31:58, dst: d6:8d:31:c4:95:85}",
144 | "UnknownFlowKey{type: 25, key: 00000000000000000000000000000000, mask: 00000000000000000000000000000000}",
145 | "UnknownFlowKey{type: 23, key: 0000, mask: 0000}",
146 | "InPortFlowKey{vport: 1}",
147 | "UnknownFlowKey{type: 24, key: 00000000, mask: 00000000}"
148 | ],
149 | "Actions": [
150 | "SetTunnelAction{id: 0000000000ade6da, ipv4src: 172.18.0.3, ipv4dst: 172.18.0.2, ttl: 64, df: true}",
151 | "OutputAction{vport: 2}"
152 | ],
153 | "Packets": 2,
154 | "Bytes": 84,
155 | "Used": 258933878
156 | },
157 | <...>
158 | ```
159 |
160 | 6. ODP encapsulates the original packet into a VXLAN frame and sends the packet out of its local vxlan port:
161 |
162 | ```
163 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report | jq '.Router.OverlayDiagnostics.fastdp.Vports[2]'
164 | {
165 | "ID": 2,
166 | "Name": "vxlan-6784",
167 | "TypeName": "vxlan"
168 | }
169 | ```
170 |
171 | 7. The VXLAN frame gets L2-switched by the `kind` bridge and arrives at the `control-plane` node, where another ODP lookup is performed
172 |
173 | ```
174 | $ WEAVEPOD=$(kubectl get pods -n kube-system -l name=weave-net --field-selector spec.nodeName=k8s-guide-control-plane -o jsonpath='{.items[0].metadata.name}')
175 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report
176 | <...>
177 | {
178 | "FlowKeys": [
179 | "UnknownFlowKey{type: 22, key: 00000000, mask: 00000000}",
180 | "UnknownFlowKey{type: 24, key: 00000000, mask: 00000000}",
181 | "UnknownFlowKey{type: 25, key: 00000000000000000000000000000000, mask: 00000000000000000000000000000000}",
182 | "TunnelFlowKey{id: 0000000000ade6da, ipv4src: 172.18.0.3, ipv4dst: 172.18.0.2}",
183 | "InPortFlowKey{vport: 2}",
184 | "UnknownFlowKey{type: 23, key: 0000, mask: 0000}",
185 | "EthernetFlowKey{src: 0a:75:b7:d0:31:58, dst: d6:8d:31:c4:95:85}"
186 | ],
187 | "Actions": [
188 | "OutputAction{vport: 1}"
189 | ],
190 | "Packets": 3,
191 | "Bytes": 182,
192 | "Used": 259264545
193 | },
194 | <...>
195 | ```
196 |
197 | 8. The output port is the veth link connecting ODP to the `weave` bridge:
198 |
199 | ```
200 | $ kubectl exec -it $WEAVEPOD -n kube-system -- /home/weave/weave --local report | jq '.Router.OverlayDiagnostics.fastdp.Vports[1]'
201 | {
202 | "ID": 1,
203 | "Name": "vethwe-datapath",
204 | "TypeName": "netdev"
205 | }
206 | ```
207 |
208 | 9. Following another L2 lookup in the `weave` bridge, the packet is sent down the veth link connected to the target Pod-3:
209 |
210 | ```
211 | $ docker exec -it k8s-guide-control-plane bridge fdb get d6:8d:31:c4:95:85 br weave
212 | d6:8d:31:c4:95:85 dev vethwepl6be12f5 master weave
213 | ```
214 |
215 | 10. Finally, the packet gets delivered to the `eth0` interface of the target pod:
216 |
217 | ```
218 | $ kubectl exec -it net-tshoot-pbp7z -- ip link show dev eth0
219 | 16: eth0@if17: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1376 qdisc noqueue state UP mode DEFAULT group default
220 | link/ether d6:8d:31:c4:95:85 brd ff:ff:ff:ff:ff:ff link-netnsid 0
221 | ```
222 |
223 | ### SNAT functionality
224 |
225 | SNAT functionality for traffic egressing the cluster is done in two stages:
226 |
227 | 1. All packets that don't match the cluster CIDR range get sent to the IP of the local `weave` bridge, which sends them down the default route already configured in the root namespace.
228 |
229 | 2. A new `WEAVE` chain gets appended to the POSTROUTING chain which matches all packets from the cluster IP range `10.32.0.0/12` destined to all non-cluster IPs `!10.32.0.0/12` and translates all flows leaving the node (`MASQUERADE`):
230 |
231 | ```
232 | iptables -t nat -vnL
233 | <...>
234 | Chain POSTROUTING (policy ACCEPT 6270 packets, 516K bytes)
235 | pkts bytes target prot opt in out source destination
236 | 51104 4185K WEAVE all -- * * 0.0.0.0/0 0.0.0.0/0
237 | <...>
238 | Chain WEAVE (1 references)
239 | pkts bytes target prot opt in out source destination
240 | 4 336 RETURN all -- * * 0.0.0.0/0 0.0.0.0/0 match-set weaver-no-masq-local dst /* Prevent SNAT to locally running containers */
241 | 0 0 RETURN all -- * * 10.32.0.0/12 224.0.0.0/4
242 | 0 0 MASQUERADE all -- * * !10.32.0.0/12 10.32.0.0/12
243 | 2 120 MASQUERADE all -- * * 10.32.0.0/12 !10.32.0.0/12
244 | ```
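
One way to see the last rule in action is to generate some traffic towards a non-cluster destination and watch the `MASQUERADE` counters increment (a rough check; the Pod name is specific to this lab run and the external IP is just an example):

```bash
# Generate a few packets towards a non-cluster IP from a Pod on k8s-guide-worker
kubectl exec -it net-tshoot-22drp -- ping -c 3 1.1.1.1

# Re-check the counters of the WEAVE chain on that Pod's node
docker exec -it k8s-guide-worker iptables -t nat -vnL WEAVE
```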
245 |
246 |
247 | ### Partial connectivity
248 |
249 | One of the interesting and unique features of Weave is its ability to function in environments with partial connectivity. This functionality is enabled by [Weave Mesh](https://github.com/weaveworks/mesh) and its use of the [gossip protocol](https://en.wikipedia.org/wiki/Gossip_protocol), allowing mesh members to dynamically discover each other and build a topology graph which is used to calculate the optimal forwarding path.
250 |
251 | One way to demonstrate this is to break the connectivity between two worker nodes and verify that pods are still able to reach each other. Let's start by checking that ping works under normal conditions:
252 |
253 | ```
254 | POD_WORKER2_IP=$(kubectl get pods -n default --field-selector spec.nodeName=k8s-guide-worker2 -o jsonpath='{.items[0].status.podIP}')
255 | POD_WORKER1_NAME=$(kubectl get pods -n default --field-selector spec.nodeName=k8s-guide-worker -o jsonpath='{.items[0].metadata.name}')
256 | kubectl -n default exec $POD_WORKER1_NAME -- ping -q -c 5 $POD_WORKER2_IP
257 | PING 10.40.0.7 (10.40.0.7) 56(84) bytes of data.
258 |
259 | --- 10.40.0.7 ping statistics ---
260 | 5 packets transmitted, 5 received, 0% packet loss, time 4055ms
261 | rtt min/avg/max/mdev = 0.136/0.178/0.278/0.051 ms
262 | ```
263 |
264 | Get the IPs of the two worker nodes:
265 |
266 | ```
267 | IP_WORKER1=$(docker inspect k8s-guide-worker --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}')
268 | IP_WORKER2=$(docker inspect k8s-guide-worker2 --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}')
269 | ```
270 |
271 | Add a new `DROP` rule for the traffic between these two IPs:
272 |
273 | ```
274 | sudo iptables -I FORWARD -s $IP_WORKER1 -d $IP_WORKER2 -j DROP
275 | ```
276 |
277 | A few seconds later, once the control plane has reconverged, repeat the ping test:
278 |
279 | ```
280 | kubectl -n default exec $POD_WORKER1_NAME -- ping -q -c 5 $POD_WORKER2_IP
281 |
282 | PING 10.40.0.7 (10.40.0.7) 56(84) bytes of data.
283 |
284 | --- 10.40.0.7 ping statistics ---
285 | 5 packets transmitted, 5 received, 0% packet loss, time 4031ms
286 | rtt min/avg/max/mdev = 0.347/0.489/0.653/0.102 ms
287 |
288 | ```
289 |
290 | The connectivity still works, although the traffic between the two worker nodes is definitely dropped:
291 |
292 | ```
293 | sudo iptables -nvL FORWARD | grep DROP
294 | Chain FORWARD (policy DROP 0 packets, 0 bytes)
295 | 312 43361 DROP all -- * * 172.18.0.5 172.18.0.4
296 | ```
297 |
298 | One thing worth noting here is that the average RTT has almost doubled compared to the original test. This is because the traffic is now relayed by the control-plane node -- the only node that has full connectivity to both worker nodes. In the dataplane, this is achieved with a special UDP-based protocol called [sleeve](https://www.weave.works/docs/net/latest/concepts/router-encapsulation/).
299 |
300 |
301 | The sending node (172.18.0.5) encapsulates ICMP packets for the other worker node (172.18.0.4) in a Sleeve payload and sends them to the control-plane node (172.18.0.2), which relays them on to the correct destination:
302 |
303 |
304 | ```
305 | 12:28:54.056814 IP 172.18.0.5.48052 > 172.18.0.2.6784: UDP, length 106
306 | 12:28:54.057599 IP 172.18.0.2.48052 > 172.18.0.4.6784: UDP, length 106
307 | 12:28:54.057957 IP 172.18.0.4.48052 > 172.18.0.2.6784: UDP, length 106
308 | 12:28:54.058376 IP 172.18.0.2.48052 > 172.18.0.5.6784: UDP, length 106
309 | ```
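
The capture above can be reproduced by sniffing the relayed traffic on the control-plane node, filtering on the UDP port seen in the output (assuming `tcpdump` is available inside the node container):

```bash
docker exec -it k8s-guide-control-plane tcpdump -ni eth0 udp port 6784
```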
310 |
311 | Although it certainly comes with substantial performance trade-offs, this functionality can come in very handy in environments with unreliable network links or where remote nodes are hosted in an isolated network environment with limited or restricted external connectivity.
312 |
313 | Don't forget to remove the drop rule at the end of the testing:
314 |
315 | ```
316 | sudo iptables -D FORWARD -s $IP_WORKER1 -d $IP_WORKER2 -j DROP
317 | ```
318 |
319 |
320 | ### Caveats and Gotchas
321 |
322 | * The official installation guide contains a number of [things to watch out for](https://www.weave.works/docs/net/latest/kubernetes/kube-addon/#-things-to-watch-out-for).
323 | * Addition or deletion of nodes, as well as intermittent connectivity to them, [results](https://github.com/weaveworks/weave/issues/3645) in flow invalidation on all nodes, which briefly disrupts all connections until the flood-and-learn process re-populates the forwarding tables.
324 |
325 |
326 |
327 | ### Additional reading
328 | 
329 | * [Weave's IPAM](https://www.weave.works/docs/net/latest/tasks/ipam/ipam/)
330 | * [Overlay Method Selection](https://github.com/weaveworks/weave/blob/master/docs/fastdp.md)
331 | * [OVS dataplane Implementation Details](http://www.openvswitch.org//support/ovscon2016/8/0935-pumputis.pdf)
332 |
333 |
334 |
--------------------------------------------------------------------------------
/content/credits.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Credits
3 | disableToc: true
4 | ---
5 |
6 | ## Contributors
7 |
8 | [TKNG contributors](https://github.com/networkop/k8s-networking-guide/graphs/contributors)
9 |
10 |
11 | ## Tooling
12 |
13 | * [Netlify](https://www.netlify.com) - Continuous deployment and hosting of this documentation
14 | * [Hugo](https://gohugo.io/) - static blog generator
15 | * [Flux](https://github.com/fluxcd/flux) - the GitOps operator for Kubernetes
16 | * [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/) - tool for running local Kubernetes clusters using Docker container “nodes”
17 |
18 | ## Special Thanks
19 |
20 | Special thanks to Roman Dodin [@ntdvps](https://twitter.com/ntdvps) for donating the domain [tkng.io](https://tkng.io).
--------------------------------------------------------------------------------
/content/dns/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "DNS"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 100
5 | summary: "The role and configuration of DNS"
6 | ---
7 |
8 | DNS plays a central role in Kubernetes service discovery. As mentioned in the [Services chapter](/services/), DNS is an essential part of how Services are consumed by end clients and, while the implementation is not baked into core Kubernetes controllers, the [DNS specification](https://github.com/kubernetes/dns/blob/master/docs/specification.md) is very explicit about the behaviour expected from such an implementation. The DNS spec defines the rules for the format of the queries and the expected responses. All Kubernetes Services have at least one corresponding A/AAAA DNS record in the format of `{service-name}.{namespace}.svc.{cluster-domain}` and the response format depends on the type of the Service:
9 |
10 | | Service Type | Response |
11 | |--------------|----------|
12 | | ClusterIP, NodePort, LoadBalancer | ClusterIP value |
13 | | Headless | List of Endpoint IPs |
14 | | ExternalName | CNAME pointing to the value of `spec.externalName` |
15 |
16 | {{% notice note %}}
17 | Some Services have additional SRV and PTR records and Pods also have a corresponding A/AAAA record; see the [official docs](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/) for more details.
18 | {{% /notice %}}
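
For example, this is roughly what the A record lookups would return for a regular and a headless Service (the Service names and IPs here are purely illustrative):

```bash
# ClusterIP/NodePort/LoadBalancer Service -> a single A record with the ClusterIP
dig +short web.default.svc.cluster.local
# 10.96.84.10    <- illustrative ClusterIP

# Headless Service -> one A record per ready Endpoint
dig +short web-headless.default.svc.cluster.local
# 10.244.1.12    <- illustrative Pod IPs
# 10.244.2.14
```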
19 |
20 | Historically, there have been two implementations of this DNS spec -- one based on `dnsmasq` and another one based on `CoreDNS`; the latter became the [default option](https://kubernetes.io/blog/2018/07/10/coredns-ga-for-kubernetes-cluster-dns/) for kubeadm since Kubernetes 1.11.
21 |
22 |
23 |
24 | ## Service Discovery -- Server-side
25 |
26 | CoreDNS implements the Kubernetes DNS spec in a [dedicated plugin](https://coredns.io/plugins/kubernetes/) that gets compiled into a static binary and deployed in a Kubernetes cluster as a Deployment and exposed as a ClusterIP service. This means that all communications with the DNS service inside a cluster are subject to the same network forwarding rules used and limitations experienced by normal Pods and set up by the [CNI](/cni/) and [Services](/services/) plugins.
27 |
28 | Since DNS speed and stability are considered [crucial](https://isitdns.com/) in any network-based communication, CoreDNS implementation is [highly optimised](https://github.com/coredns/deployment/blob/master/kubernetes/Scaling_CoreDNS.md) to minimise memory consumption and maximise query processing rate. In order to achieve that, CoreDNS stores only the [relevant parts](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/service.go#L33) of [Services](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/service.go#L12), [Pods](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/pod.go#L13) and [Endpoints](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/object/endpoint.go#L14) objects in its [local cache](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/controller.go#L115) that is optimised to return a response in a [single lookup](https://github.com/coredns/coredns/blob/a644eb4472ab61cdef8405b4e42bc9892f2e9295/plugin/kubernetes/kubernetes.go#L495).
29 |
30 | By default, CoreDNS also acts as a DNS proxy for all external domains (e.g. example.com) using the [`forward` plugin](https://coredns.io/plugins/forward/) and is often deployed with the [`cache` plugin](https://coredns.io/plugins/cache/) enabled. The entire CoreDNS configuration can be found in the `coredns` ConfigMap:
31 |
32 | ```yaml
33 | apiVersion: v1
34 | kind: ConfigMap
35 | metadata:
36 | name: coredns
37 | namespace: kube-system
38 | data:
39 | Corefile: |
40 | .:53 {
41 | errors
42 | health {
43 | lameduck 5s
44 | }
45 | ready
46 | kubernetes cluster.local in-addr.arpa ip6.arpa {
47 | pods insecure
48 | fallthrough in-addr.arpa ip6.arpa
49 | ttl 30
50 | }
51 | prometheus :9153
52 | forward . /etc/resolv.conf {
53 | max_concurrent 1000
54 | }
55 | cache 30
56 | loop
57 | reload
58 | loadbalance
59 | }
60 | ```
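
The live version of this configuration can always be pulled straight from the cluster, e.g.:

```bash
kubectl -n kube-system get configmap coredns -o jsonpath='{.data.Corefile}'
```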
61 |
62 | ## Service Discovery -- Client-side
63 |
64 | DNS configuration inside a Pod is controlled by the `spec.dnsPolicy` and `spec.dnsConfig` [settings](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-s-dns-policy). By default, kubelet will configure Pods with the cluster DNS IP, which is stored in its configuration file and [hard-coded](https://github.com/kubernetes/kubernetes/blob/cde45fb161c5a4bfa7cfe45dfd814f6cc95433f7/cmd/kubeadm/app/constants/constants.go#L638) by kubeadm to the tenth IP of the ClusterIP range.
65 |
66 | ```
67 | root@k8s-guide-control-plane:/# cat /var/lib/kubelet/config.yaml
68 | apiVersion: kubelet.config.k8s.io/v1beta1
69 | ...
70 | clusterDNS:
71 | - 10.96.0.10
72 | ...
73 | ```
74 | With the above default settings, this is how a Pod deployed in the default namespace would see its own `resolv.conf` file:
75 |
76 | ```
77 | $ cat /etc/resolv.conf
78 | search default.svc.cluster.local svc.cluster.local cluster.local
79 | nameserver 10.96.0.10
80 | options ndots:5
81 | ```
82 |
83 | The search domains and `ndots` value are configured so that any non-FQDN DNS query made by a Pod is first tried in all of the specified domains, which allows the internal cluster DNS schema to take precedence over the external DNS ([explanation](https://github.com/kubernetes/kubernetes/issues/33554#issuecomment-266251056)). For example, any Pod in the `default` Namespace can look up the ClusterIP of the `kubernetes` Service in a single query (the shell is running a `stern -n kube-system -l k8s-app=kube-dns` in the background):
84 |
85 | ```bash
86 | $ kubectl -n default exec ds/net-tshoot -- dig kubernetes +search +short
87 | 10.96.0.1
88 | coredns-558bd4d5db-sqhkz coredns [INFO] 10.244.0.5:36255 - 36946 "A IN kubernetes.default.svc.cluster.local. udp 77 false 4096" NOERROR qr,aa,rd 106 0.0002139s
89 | ```
90 |
91 | The downside of this behaviour is that any external domain lookup will require at least 4 separate queries:
92 |
93 | ```
94 | $ kubectl -n default exec ds/net-tshoot -- dig tkng.io +search +short
95 | coredns-558bd4d5db-5jbgh coredns [INFO] 10.244.0.5:54816 - 13660 "A IN tkng.io.default.svc.cluster.local. udp 74 false 4096" NXDOMAIN qr,aa,rd 144 0.0002719s
96 | coredns-558bd4d5db-5jbgh coredns [INFO] 10.244.0.5:38006 - 38084 "A IN tkng.io.svc.cluster.local. udp 66 false 4096" NXDOMAIN qr,aa,rd 136 0.0001705s
97 | coredns-558bd4d5db-5jbgh coredns [INFO] 10.244.0.5:35302 - 4454 "A IN tkng.io.cluster.local. udp 62 false 4096" NXDOMAIN qr,aa,rd 132 0.0001219s
98 | 172.67.201.112
99 | 104.21.21.243
100 | coredns-558bd4d5db-sqhkz coredns [INFO] 10.244.0.5:47052 - 6189 "A IN tkng.io. udp 48 false 4096" NOERROR qr,rd,ad 71 0.0183829s
101 | ```
102 |
103 |
104 | ## Optimisations
105 |
106 | DNS is widely regarded as the main [source](https://isitdns.com/) of all IT problems, and Kubernetes is no exception (see [1](https://github.com/kubernetes/kubernetes/issues/56903), [2](https://www.weave.works/blog/racy-conntrack-and-dns-lookup-timeouts), [3](https://github.com/kubernetes/kubernetes/issues/62628), [4](https://pracucci.com/kubernetes-dns-resolution-ndots-options-and-why-it-may-affect-application-performances.html)). Its deployment model and reliance on [HPA](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) mean that some Nodes can become connection bottlenecks while the CPU and memory utilisation of the DNS Pods remains relatively low. There are a number of optimisations that can be enabled to improve DNS performance at the expense of additional resource utilisation and complexity:
107 |
108 | * The [**autopath** plugin](https://coredns.io/plugins/autopath/) can be enabled in CoreDNS to make it follow the chain of search paths on behalf of a client, thereby reducing the number of queries for an external domain required by the client from 4 (see above) to just one.
109 | * Each Kubernetes Node can run a [**NodeLocal DNSCache**](https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/) -- a daemonset of recursive DNS resolvers designed to reduce the load on a centralised CoreDNS deployment by serving as a caching layer between Pods and the DNS service.
110 | * Frequently queried external hostnames can be written with a trailing dot (e.g. `tkng.io.`). This makes the query absolute, avoiding the search path expansion through `cluster.local` and any other search domains in `/etc/resolv.conf`, as shown in the example below.
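
The effect of the trailing dot is easy to observe with the same troubleshooting Pods used earlier in this chapter:

```bash
# Relative name: up to 4 queries due to ndots:5 and the search list
kubectl -n default exec ds/net-tshoot -- dig tkng.io +search +short

# Absolute name: a single query sent straight to the forwarder
kubectl -n default exec ds/net-tshoot -- dig tkng.io. +search +short
```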
111 |
112 |
113 | ## External DNS
114 |
115 | The [DNS Specification](https://github.com/kubernetes/dns/blob/master/docs/specification.md) is only focused on intra-cluster DNS resolution and service discovery. Anything to do with external DNS is left out of scope, despite the fact that most end-users are located outside of a cluster. For them, Kubernetes has to provide a way to discover externally exposed Kubernetes resources -- LoadBalancer Services, Ingresses and Gateways -- and there are two ways this can be accomplished:
116 |
117 | * An out-of-cluster DNS zone can be orchestrated by the [**ExternalDNS** cluster add-on](https://github.com/kubernetes-sigs/external-dns) -- a Kubernetes controller that synchronises external Kubernetes resources with any supported third-party DNS provider via an API (see the [GH page](https://github.com/kubernetes-sigs/external-dns#externaldns) for the list of supported providers).
118 | * An existing DNS zone can be configured to delegate a subdomain to a self-hosted external DNS plugin, e.g. [**k8s_gateway**](https://github.com/ori-edge/k8s_gateway). This approach assumes that this DNS plugin is deployed inside a cluster and exposed via a LoadBalancer IP, which is then used in an NS record for the delegated zone. All queries hitting this subdomain will get forwarded to this plugin, which will respond as an authoritative nameserver for the delegated subdomain (a rough sketch of this setup is shown after this list).
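
A minimal sketch of the second option, assuming the plugin is exposed via a LoadBalancer Service called `k8s-gateway` and a parent zone `example.com` delegates `k8s.example.com` to it (all names and IPs are illustrative):

```bash
# 1. Find the LoadBalancer IP of the in-cluster DNS plugin
kubectl -n kube-system get svc k8s-gateway \
  -o jsonpath='{.status.loadBalancer.ingress[0].ip}'

# 2. In the parent zone (managed outside of the cluster), delegate the subdomain:
#      k8s.example.com.      IN NS  ns1.k8s.example.com.
#      ns1.k8s.example.com.  IN A   <LoadBalancer IP from step 1>

# 3. External clients can now resolve exposed cluster resources directly
dig web.k8s.example.com +short
```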
119 |
120 |
--------------------------------------------------------------------------------
/content/ingress/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Ingress & Egress"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 70
5 | summary: "North-South traffic forwarding"
6 | ---
7 |
8 | This chapter deals with anything related to North-South traffic forwarding in Kubernetes. First, let's make it clear that in both ingress (North) and egress (South) cases, traffic flows are actually bidirectional, i.e. a single flow would have packets flowing in both directions. The main distinction between ingress and egress is the direction of the original packet, i.e. where the client and server are located relative to the Kubernetes cluster boundary.
9 |
10 | These two types of traffic are treated very differently and almost always take asymmetric paths. This is because ingress is usually more important -- it's the revenue-generating user traffic for cluster applications, while egress is mainly non-revenue, Internet-bound traffic, e.g. DNS queries, package updates -- something that may not even be needed, depending on the application architecture.
11 |
12 | {{% notice note %}}
13 | Egress may have a slightly different meaning in the context of service meshes and multiple clusters, but this is outside of the scope of this chapter.
14 | {{% /notice %}}
15 |
16 |
17 | Because of the above differences, ingress and egress traffic needs to be examined separately and this part of the guide will be split into the following chapters:
18 |
19 | * [**Ingress API**](/ingress/ingress/) -- the original method of routing incoming traffic to different cluster applications.
20 | * [**Gateway API**](/ingress/gateway/) -- can be treated as the evolution of the Ingress API with the same goals and scope.
21 | * [**Egress**](/ingress/egress/) -- describes different options for egress traffic engineering.
22 |
23 |
24 |
--------------------------------------------------------------------------------
/content/ingress/egress/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Egress"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 70
5 | summary: "Egress traffic engineering"
6 | ---
7 |
8 | Egress is a very loosely defined term in the Kubernetes ecosystem. Unlike its counterpart, egress traffic is not controlled by any standard Kubernetes API or proxy. This is because most of the egress traffic is not revenue-generating and, in fact, can be completely optional. For situations when a Pod needs to communicate with an external service, it would make sense to do this via an API gateway rather than allow direct communication, and most service meshes provide this functionality, e.g. Consul's [Terminating Gateway](https://www.consul.io/docs/connect/gateways/terminating-gateway) or OSM's [Egress Policy API](https://docs.openservicemesh.io/docs/guides/traffic_management/egress/). However, we still need a way to allow for Pod-initiated external communication without a service mesh integration, and this is how it can be done:
9 |
10 | 1. By default, traffic leaving a Pod will follow the default route out of a Node and will get masqueraded (SNAT'ed) to the address of the outgoing interface. This is normally provisioned by a CNI plugin option, e.g. the `ipMasq` option of the [bridge plugin](https://www.cni.dev/plugins/current/main/bridge/#network-configuration-reference) (see the sketch after this list), or by a separate agent, e.g. [`ip-masq-agent`](https://github.com/kubernetes-sigs/ip-masq-agent).
11 | 2. For security reasons, some or all egress traffic can get redirected to an "egress gateway" deployed on a subset of Kubernetes Nodes. The operation, UX and redirection mechanism are implementation-specific and can work at an application level, e.g. Istio's [Egress Gateway](https://istio.io/latest/docs/tasks/traffic-management/egress/egress-gateway/), or at an IP level, e.g. Cilium's [Egress Gateway](https://docs.cilium.io/en/stable/gettingstarted/egress-gateway/).
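
To illustrate the first option, this is roughly what a bridge CNI configuration with masquerading enabled could look like (a sketch only; real-world configurations are generated by the CNI plugin installer and will differ):

```bash
cat <<'EOF' > /etc/cni/net.d/10-bridge.conflist
{
  "cniVersion": "0.4.0",
  "name": "bridge-net",
  "plugins": [
    {
      "type": "bridge",
      "bridge": "cni0",
      "isGateway": true,
      "ipMasq": true,
      "ipam": {
        "type": "host-local",
        "ranges": [[{ "subnet": "10.244.1.0/24" }]],
        "routes": [{ "dst": "0.0.0.0/0" }]
      }
    }
  ]
}
EOF
```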
12 |
13 | In both cases, the end result is that a packet leaves one of the Kubernetes Nodes, SNAT'ed to the address of the egress interface. The rest of the forwarding is done by the underlying network.
14 |
15 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=g6ESgU9g5ULUhjZ5bZWG&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
16 |
17 |
18 | ## Lab
19 |
20 | The way direct local egress works has already been described in the CNI part of this guide. Refer to the respective sections of the [kindnet](/cni/kindnet/#snat-functionality), [flannel](/cni/flannel/#snat-functionality), [weave](/cni/weave/#snat-functionality), [calico](/cni/calico/#snat-functionality) and [cilium](/cni/cilium/#snat-functionality) chapters for more details.
21 |
22 | For this lab exercise, we’ll focus on how Cilium implements the Egress Gateway functionality via a custom resource called `CiliumEgressNATPolicy`.
23 |
24 | ### Preparation
25 |
26 |
27 | Assuming that the lab environment is already [set up](/lab/), Cilium can be enabled with the following command:
28 |
29 | ```bash
30 | make cilium
31 | ```
32 |
33 | Wait for the Cilium daemonset to initialize:
34 |
35 | ```bash
36 | make cilium-wait
37 | ```
38 |
39 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin:
40 |
41 | ```bash
42 | make nuke-all-pods
43 | ```
44 |
45 | To make sure there is no interference from `kube-proxy`, we'll remove it completely along with any IPTables rules set up by it:
46 |
47 | ```
48 | make nuke-kube-proxy
49 | ```
50 |
51 | Deploy an "external" [echo server](https://github.com/mpolden/echoip) that will be used to check the source IP of the incoming request:
52 |
53 | ```
54 | make egress-prep
55 | ```
56 |
57 | By default, we should have a `net-tshoot` daemonset running on all Nodes:
58 |
59 | ```
60 | $ kubectl -n default get pod -owide
61 | NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
62 | net-tshoot-5ngbc 1/1 Running 0 4h53m 10.0.0.174 k8s-guide-control-plane
63 | net-tshoot-gcj27 1/1 Running 0 4h53m 10.0.2.86 k8s-guide-worker2
64 | net-tshoot-pcgf8 1/1 Running 0 4h53m 10.0.1.42 k8s-guide-worker
65 | ```
66 |
67 | We can use these Pods to verify the (default) local egress behaviour by sending an HTTP GET to the echo server:
68 |
69 | ```
70 | $ kubectl -n default get pod -l name=net-tshoot -o name | xargs -I{} kubectl -n default exec {} -- wget -q -O - echo
71 | 172.18.0.5
72 | 172.18.0.3
73 | 172.18.0.6
74 | ```
75 |
76 | These are the same IPs that are assigned to our lab Nodes:
77 |
78 | ```
79 | $ make node-ip-1 && make node-ip-2 && make node-ip-3
80 | control-plane:172.18.0.3
81 | worker:172.18.0.5
82 | worker2:172.18.0.6
83 | ```
84 |
85 | Finally, we can enable the `CiliumEgressNATPolicy` that will NAT all traffic from Pods in the default namespace to the IP of the control-plane node:
86 |
87 | ```
88 | make egress-setup
89 | ```
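
Under the hood, `egress-setup` applies a policy along these lines -- a sketch that mirrors the lab values (Pods in the `default` namespace get SNAT'ed to the control-plane IP `172.18.0.3` for destinations within `172.18.0.0/16`); the exact manifest lives in the labs repository:

```bash
kubectl apply -f - <<'EOF'
apiVersion: cilium.io/v2alpha1
kind: CiliumEgressNATPolicy
metadata:
  name: egress-sample
spec:
  egress:
  - podSelector:
      matchLabels:
        # selects all Pods in the default namespace
        io.kubernetes.pod.namespace: default
  destinationCIDRs:
  - 172.18.0.0/16
  egressSourceIP: "172.18.0.3"
EOF
```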
90 |
91 | This can be verified by re-running the earlier command:
92 |
93 | ```
94 | $ kubectl -n default get pod -l name=net-tshoot -o name | xargs -I{} kubectl -n default exec {} -- wget -q -O - echo
95 | 172.18.0.3
96 | 172.18.0.3
97 | 172.18.0.3
98 | ```
99 |
100 | We can see that now all three requests appear to have come from the same Node.
101 |
102 |
103 | ### Walkthrough
104 |
105 | Now let's briefly walk through how Cilium implements the above NAT policy. The Cilium CNI chapter [explains](/cni/cilium/#2-nodes-ebpf-programs) how certain eBPF programs get attached to different interfaces. In our case, we're looking at a program attached to all `lxc` interfaces that processes packets coming in from a Pod, called [`from-container`](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_lxc.c#L970). Inside this program, a packet goes through several functions before it eventually gets to the `handle_ipv4_from_lxc` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_lxc.c#L510)) which does the bulk of the work in IPv4 packet processing. The relevant part of this function is this one:
106 |
107 | ```c
108 | #ifdef ENABLE_EGRESS_GATEWAY
109 | {
110 | struct egress_info *info;
111 | struct endpoint_key key = {};
112 |
113 | info = lookup_ip4_egress_endpoint(ip4->saddr, ip4->daddr);
114 | if (!info)
115 | goto skip_egress_gateway;
116 |
117 | /* Encap and redirect the packet to egress gateway node through a tunnel.
118 | * Even if the tunnel endpoint is on the same host, follow the same data
119 | * path to be consistent. In future, it can be optimized by directly
120 | * direct to external interface.
121 | */
122 | ret = encap_and_redirect_lxc(ctx, info->tunnel_endpoint, encrypt_key,
123 | &key, SECLABEL, monitor);
124 | if (ret == IPSEC_ENDPOINT)
125 | goto encrypt_to_stack;
126 | else
127 | return ret;
128 | }
129 | skip_egress_gateway:
130 | #endif
131 | ```
132 |
133 | Here, our packet's source and destination IPs get passed to the `lookup_ip4_egress_endpoint` which performs a lookup in the following map:
134 |
135 |
136 | ```
137 | $ NODE=k8s-guide-worker2
138 | $ cilium=$(kubectl get -l k8s-app=cilium pods -n cilium --field-selector spec.nodeName=$NODE -o jsonpath='{.items[0].metadata.name}')
139 | $ kubectl -n cilium exec -it $cilium -- bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_egress_v4
140 | key: 30 00 00 00 0a 00 00 1f ac 12 00 00 value: ac 12 00 03 ac 12 00 03
141 | key: 30 00 00 00 0a 00 01 d1 ac 12 00 00 value: ac 12 00 03 ac 12 00 03
142 | key: 30 00 00 00 0a 00 02 0e ac 12 00 00 value: ac 12 00 03 ac 12 00 03
143 | Found 3 elements
144 | ```
145 |
146 | The above can be translated as follows:
147 |
148 | * Match all packets with source IP `10.0.0.174`, `10.0.2.86` or `10.0.1.42` (all Pods in the default namespace) and destination prefix of `172.18.0.0/16`
149 | * Return the value with egress IP of `172.18.0.3` and tunnel endpoint of `172.18.0.3`.
150 |
151 | The returned value is used in the `encap_and_redirect_lxc` function call that encapsulates the packet and forwards it to the Node with IP `172.18.0.3`.
152 |
153 | On the egress Node, our packet gets processed by the `from-overlay` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_overlay.c#L289)), and eventually falls through to the local network stack. The local network stack has the default route pointing out the `eth0` interface, which is where our packet gets forwarded next.
154 |
155 | At this point, Cilium applies its configured IP masquerade [policy](https://docs.cilium.io/en/v1.9/concepts/networking/masquerading/) using either IPTables or eBPF translation. The eBPF masquerading is implemented as a part of the `to-netdev` ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/bpf_host.c#L1010)) program attached to the egress direction of the `eth0` interface.
156 |
157 | ```c
158 | #if defined(ENABLE_NODEPORT) && \
159 | (!defined(ENABLE_DSR) || \
160 | (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
161 | defined(ENABLE_MASQUERADE) || \
162 | defined(ENABLE_EGRESS_GATEWAY))
163 | if ((ctx->mark & MARK_MAGIC_SNAT_DONE) != MARK_MAGIC_SNAT_DONE) {
164 | ret = handle_nat_fwd(ctx);
165 | if (IS_ERR(ret))
166 | return send_drop_notify_error(ctx, 0, ret,
167 | CTX_ACT_DROP,
168 | METRIC_EGRESS);
169 | }
170 | #endif
171 | ```
172 |
173 | From `handle_nat_fwd` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/lib/nodeport.h#L2179)) the processing goes through `tail_handle_nat_fwd_ipv4`, `nodeport_nat_ipv4_fwd` and eventually gets to the `snat_v4_process` function ([source](https://github.com/cilium/cilium/blob/18513dbc1379a2d439163876e50dd68b009169fd/bpf/lib/nat.h#L504)) where all of the NAT translations take place. All new packets will fall through to the `snat_v4_new_mapping` function where a new random source port will be allocated to the packet:
174 |
175 | ```c
176 | #pragma unroll
177 | for (retries = 0; retries < SNAT_COLLISION_RETRIES; retries++) {
178 | if (!snat_v4_lookup(&rtuple)) {
179 | ostate->common.created = bpf_mono_now();
180 | rstate.common.created = ostate->common.created;
181 |
182 | ret = snat_v4_update(otuple, ostate, &rtuple, &rstate);
183 | if (!ret)
184 | break;
185 | }
186 |
187 | port = __snat_clamp_port_range(target->min_port,
188 | target->max_port,
189 | retries ? port + 1 :
190 | get_prandom_u32());
191 | rtuple.dport = ostate->to_sport = bpf_htons(port);
192 | }
193 | ```
194 |
195 | Finally, once the new source port has been selected and the connection tracking entry for subsequent packets has been set up, the packet gets its headers updated before being sent out of the egress interface:
196 |
197 | ```c
198 | return dir == NAT_DIR_EGRESS ?
199 | snat_v4_rewrite_egress(ctx, &tuple, state, off, ipv4_has_l4_header(ip4)) :
200 | snat_v4_rewrite_ingress(ctx, &tuple, state, off);
201 | ```
--------------------------------------------------------------------------------
/content/ingress/gateway/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Gateway API"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 20
5 | summary: "Evolution of Ingress API"
6 | ---
7 |
8 | Ingress API has had a very difficult history and had remained in `v1beta1` for many years. Despite having a thriving ecosystem of controller implementations, their use of the Ingress API has remained largely incompatible. In addition to that, the same controller vendors have started shipping their own sets of custom resources designed to address the limitations of the Ingress API. At some point, the Kubernetes SIG Network group even discussed the possibility of scrapping the Ingress API altogether and letting each vendor bring their own set of CRDs (see "Ingress Discussion Notes" in [Network SIG Meeting Minutes](https://docs.google.com/document/d/1_w77-zG_Xj0zYvEMfQZTQ-wPP4kXkpGD8smVtW_qqWM/edit)). Despite all that, Ingress API has survived, addressed some of the more pressing issues and finally got promoted to `v1` in Kubernetes `v1.19`. However, some of the problems could not be solved by an incremental re-design and this is why the [Gateway API](https://gateway-api.sigs.k8s.io/) project (formerly called Service API) was founded.
9 |
10 | Gateway API decomposes a single Ingress API into a set of [independent resources](https://gateway-api.sigs.k8s.io/concepts/api-overview/) that can be combined via label selectors and references to build the desired proxy state. This decomposition follows a pattern very commonly found in proxy configuration -- listener, route and backends -- and can be viewed as a hierarchy of objects:
11 |
12 | |Hierarchy | Description |
13 | |--------------|---|
14 | | Gateway Class | Identifies a single GatewayAPI controller installed in a cluster. |
15 | | Gateway | Associates listeners with Routes, belongs to one of the Gateway classes. |
16 | | Route | Defines rules for traffic routing by linking Gateways with Services. |
17 | | Service | Represents a set of Endpoints to be used as backends. |
18 |
19 | This is how the above hierarchy can be combined to expose an existing `web` Service to the outside world as `http://gateway.tkng.io` (see the Lab [walkthrough](/ingress/gateway/#walkthrough) for more details):
20 |
21 | ```yaml
22 | apiVersion: networking.x-k8s.io/v1alpha1
23 | kind: GatewayClass
24 | metadata:
25 | name: istio
26 | spec:
27 | controller: istio.io/gateway-controller
28 | ---
29 | apiVersion: networking.x-k8s.io/v1alpha1
30 | kind: Gateway
31 | metadata:
32 | name: gateway
33 | namespace: istio-system
34 | spec:
35 | gatewayClassName: istio
36 | listeners:
37 | - hostname: "*"
38 | port: 80
39 | protocol: HTTP
40 | routes:
41 | namespaces:
42 | from: All
43 | selector:
44 | matchLabels:
45 | selected: "yes"
46 | kind: HTTPRoute
47 | ---
48 | apiVersion: networking.x-k8s.io/v1alpha1
49 | kind: HTTPRoute
50 | metadata:
51 | name: http
52 | namespace: default
53 | labels:
54 | selected: "yes"
55 | spec:
56 | gateways:
57 | allow: All
58 | hostnames: ["gateway.tkng.io"]
59 | rules:
60 | - matches:
61 | - path:
62 | type: Prefix
63 | value: /
64 | forwardTo:
65 | - serviceName: web
66 | port: 80
67 | ```
68 |
69 | Regardless of all the new features and operational benefits Gateway API brings, its final goal is exactly the same as for Ingress API -- to configure a proxy for external access to applications running in a cluster.
70 |
71 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=872_TPyC9xnwDXYNSrfC&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
72 |
73 |
74 | ## Lab
75 |
76 | For this lab exercise, we'll use one of the Gateway API implementations -- the one from [Istio](https://istio.io/).
77 |
78 |
79 | ### Preparation
80 |
81 |
82 | Assuming that the lab environment is already [set up](/lab/), `Istio` can be set up with the following commands:
83 |
84 |
85 | ```
86 | make gateway-setup
87 | ```
88 |
89 | Wait for all Istio Pods to fully initialise:
90 |
91 | ```
92 | $ make gateway-check
93 | pod/istio-ingressgateway-574dff7b88-9cd7v condition met
94 | pod/istiod-59db6b6d9-pl6np condition met
95 | pod/metallb-controller-748756655f-zqdxn condition met
96 | pod/metallb-speaker-97tb7 condition met
97 | pod/metallb-speaker-pwvrx condition met
98 | pod/metallb-speaker-qln9k condition met
99 | ```
100 |
101 | Set up a test Deployment to be used in the walkthrough:
102 |
103 |
104 | ```
105 | $ make deployment && make cluster-ip
106 | ```
107 |
108 | Make sure that the Gateway has been assigned with a LoadBalancer IP:
109 |
110 | ```
111 | $ kubectl get -n istio-system gateways gateway -o jsonpath='{.status.addresses}' | jq
112 | [
113 | {
114 | "type": "IPAddress",
115 | "value": "198.51.100.0"
116 | }
117 | ]
118 | ```
119 |
120 | Now we can verify the functionality:
121 |
122 | ```
123 | $ docker exec k8s-guide-control-plane curl -s -HHost:gateway.tkng.io http://198.51.100.0/ | grep Welcome
124 | <title>Welcome to nginx!</title>
125 | <h1>Welcome to nginx!</h1>
126 | ```
127 |
128 | ### Walkthrough
129 |
130 | One of the easiest ways to verify data plane configuration is to use the [istioctl](https://istio.io/latest/docs/setup/install/istioctl/) tool. The first thing we can do is look at the current state of all data plane proxies. In our case, we're not using Istio's service mesh functionality, so the only proxy will be the `istio-ingressgateway`:
131 |
132 | ```
133 | $ istioctl proxy-status
134 | NAME CDS LDS EDS RDS ISTIOD VERSION
135 | istio-ingressgateway-574dff7b88-tnqck.istio-system SYNCED SYNCED SYNCED SYNCED istiod-59db6b6d9-j8kt8 1.12-alpha.2a768472737998f0e13cfbfec74162005c53300c
136 | ```
137 |
138 | Let's take a close look at the `proxy-config`, starting with the current set of listeners:
139 |
140 | ```
141 | $ istioctl proxy-config listener istio-ingressgateway-574dff7b88-tnqck.istio-system
142 | ADDRESS PORT MATCH DESTINATION
143 | 0.0.0.0 8080 ALL Route: http.8080
144 | 0.0.0.0 15021 ALL Inline Route: /healthz/ready*
145 | 0.0.0.0 15090 ALL Inline Route: /stats/prometheus*
146 | ```
147 |
148 | The one that we're interested in is called `http.8080` and here is how we can check all of the routing currently configured for it:
149 |
150 | ```json
151 | "istioctl proxy-config route istio-ingressgateway-574dff7b88-tnqck.istio-system --name http.8080 -ojson"
152 | [
153 | {
154 | "name": "http.8080",
155 | "virtualHosts": [
156 | {
157 | "name": "gateway.tkng.io:80",
158 | "domains": [
159 | "gateway.tkng.io",
160 | "gateway.tkng.io:*"
161 | ],
162 | "routes": [
163 | {
164 | "match": {
165 | "prefix": "/",
166 | "caseSensitive": true
167 | },
168 | "route": {
169 | "cluster": "outbound|80||web.default.svc.cluster.local",
170 | "timeout": "0s",
171 | "retryPolicy": {
172 | "retryOn": "connect-failure,refused-stream,unavailable,cancelled,retriable-status-codes",
173 | "numRetries": 2,
174 | "retryHostPredicate": [
175 | {
176 | "name": "envoy.retry_host_predicates.previous_hosts"
177 | }
178 | ],
179 | "hostSelectionRetryMaxAttempts": "5",
180 | "retriableStatusCodes": [
181 | 503
182 | ]
183 | },
184 | "maxGrpcTimeout": "0s"
185 | },
186 | "metadata": {
187 | "filterMetadata": {
188 | "istio": {
189 | "config": "/apis/networking.istio.io/v1alpha3/namespaces/default/virtual-service/http-istio-autogenerated-k8s-gateway"
190 | }
191 | }
192 | },
193 | "decorator": {
194 | "operation": "web.default.svc.cluster.local:80/*"
195 | }
196 | }
197 | ],
198 | "includeRequestAttemptCount": true
199 | }
200 | ],
201 | "validateClusters": false
202 | }
203 | ]
204 | ```
205 |
206 | From the above output we can see that the proxy is set up to route all HTTP requests with `Host: gateway.tkng.io` header to a cluster called `outbound|80||web.default.svc.cluster.local`. Let's check this cluster's Endpoints:
207 |
208 | ```
209 | $ istioctl proxy-config endpoints istio-ingressgateway-574dff7b88-tnqck.istio-system --cluster "outbound|80||web.default.svc.cluster.local"
210 | ENDPOINT STATUS OUTLIER CHECK CLUSTER
211 | 10.244.1.12:80 HEALTHY OK outbound|80||web.default.svc.cluster.local
212 | ```
213 |
214 | The above Endpoint address corresponds to the only running Pod in the `web` deployment:
215 |
216 | ```
217 | $ kubectl get pod -owide -l app=web
218 | NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
219 | web-96d5df5c8-p8f97 1/1 Running 0 104m 10.244.1.12 k8s-guide-worker
220 | ```
--------------------------------------------------------------------------------
/content/ingress/ingress/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Ingress API"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 10
5 | summary: "Ingress proxy routing"
6 | ---
7 |
8 | Although technically it is possible to expose internal applications via NodePort or LoadBalancer Services, this happens very rarely. There are two main reasons for that:
9 |
10 | * **Costs** -- since each LoadBalancer Service is associated with a single external address, this can translate into a sizeable fee when running in a public cloud environment.
11 | * **Functionality** -- simple L4 load balancing provided by Services lacks a lot of the features that are typically associated with an application proxy or gateway. This means that each exposed application will need to take care of things like TLS management, rate-limiting, authentication and intelligent traffic routing on its own.
12 |
13 | Ingress was designed as a generic, vendor-independent API to configure an HTTP load balancer that would be available to multiple Kubernetes applications. Running an Ingress would amortise the costs and efforts of implementing an application gateway functionality and provide an easy to consume, native Kubernetes experience to cluster operators and users. At the very least, a user is expected to define a single rule telling the Ingress which backend Service to use. This results in all incoming HTTP requests being routed to one of the healthy Endpoints of this Service:
14 |
15 | ```yaml
16 | apiVersion: networking.k8s.io/v1
17 | kind: Ingress
18 | metadata:
19 | name: example
20 | spec:
21 | rules:
22 | - http:
23 | paths:
24 | - backend:
25 | service:
26 | name: web
27 | port:
28 | number: 80
29 | path: /
30 | ```
31 |
32 | Similar to Service type [LoadBalancer](/services/loadbalancer/), Kubernetes only defines the Ingress API and leaves implementation to cluster add-ons. In public cloud environments, these functions are implemented by existing application load balancers, e.g. [Application Gateway](https://azure.microsoft.com/en-us/services/application-gateway/) in AKS, [Application Load Balancer](https://docs.aws.amazon.com/eks/latest/userguide/alb-ingress.html) in EKS or [Google Front Ends (GFEs)](https://cloud.google.com/load-balancing/docs/https) for GKE. However, unlike a LoadBalancer controller, Kubernetes distributions do not limit the type of Ingress controller that can be deployed to perform these functions. There are over a dozen Ingress controller implementations from the major load balancer, proxy and service mesh vendors, which makes choosing the right Ingress controller a very daunting task. Several attempts have been made to compile a decision matrix to help with this choice -- [one](https://docs.google.com/spreadsheets/d/1DnsHtdHbxjvHmxvlu7VhzWcWgLAn_Mc5L1WlhLDA__k/edit#gid=0) done by Flant and [one](https://docs.google.com/spreadsheets/d/191WWNpjJ2za6-nbG4ZoUMXMpUK8KlCIosvQB0f-oq3k/edit#gid=907731238) by learnk8s.io. Multiple Ingress controllers can be deployed in a single cluster and Ingress resources are associated with a particular controller based on the `.spec.ingressClassName` field.
33 |
34 | Ingress controller's implementation almost always includes the following two components:
35 |
36 | * **Controller** -- a process that communicates with the API server and collects all of the information required to successfully provision its proxies.
37 | * **Proxy** -- a data plane component, managed by the controller (via API, plugins or plain text files), can be scaled up and down by the Horizontal Pod Autoscaler.
38 |
39 | Typically, during the installation process, an Ingress Controller creates a Service type LoadBalancer and uses the allocated IP to update the `.status.loadBalancer` field of all managed Ingresses.
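
For example, once everything is wired up, the allocated address can be inspected on any managed Ingress (a quick check; `tkng-1` is one of the Ingresses created in the lab below):

```bash
kubectl get ingress tkng-1 -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
```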
40 |
41 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=tSopwAg3hkGCBVX-7IBd&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
42 |
43 |
44 | ## Lab
45 |
46 | For this lab exercise, we'll use one of the most popular open-source Ingress controllers -- [ingress-nginx](https://kubernetes.github.io/ingress-nginx/).
47 |
48 |
49 | ### Preparation
50 |
51 |
52 | Assuming that the lab environment is already [set up](/lab/), `ingress-nginx` can be set up with the following commands:
53 |
54 | ```
55 | make ingress-setup
56 | ```
57 |
58 | Install a LoadBalancer controller to allocate external IP for the Ingress controller
59 |
60 | ```
61 | make metallb
62 | ```
63 |
64 | Wait for Ingress controller to fully initialise
65 |
66 | ```
67 | make ingress-wait
68 | ```
69 |
70 | Set up a couple of test Deployments and associated Ingress resources to be used in the walkthrough.
71 |
72 | ```
73 | make ingress-prep
74 | ```
75 |
76 | The above command sets up two ingress resources -- one doing the path-based routing and one doing the host-based routing. Use the following command to confirm that both Ingresses have been set up and assigned an external IP:
77 |
78 | ```
79 | $ kubectl get ing
80 | NAME CLASS HOSTS ADDRESS PORTS AGE
81 | tkng-1 nginx * 198.51.100.0 80 46s
82 | tkng-2 nginx prod,dev 198.51.100.0 80 26s
83 | ```
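
For reference, the path-based Ingress created above would look roughly like this -- a sketch reconstructed from the outputs in this section (the actual manifests live in the labs repository):

```bash
kubectl apply -f - <<'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tkng-1
spec:
  ingressClassName: nginx
  rules:
  - http:
      paths:
      - path: /prod
        pathType: Prefix
        backend:
          service:
            name: prod
            port:
              number: 8080
      - path: /dev
        pathType: Prefix
        backend:
          service:
            name: dev
            port:
              number: 8080
EOF
```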
84 |
85 |
86 | Now we can verify the path-based routing functionality:
87 |
88 | ```
89 | $ docker exec k8s-guide-control-plane curl -s http://198.51.100.0/dev
90 | Server address: 10.244.1.14:8080
91 | Server name: dev-694776949d-w2fw7
92 | Date: 29/Aug/2021:16:25:41 +0000
93 | URI: /dev
94 | Request ID: 6ccd350709dd92b76cdfabbcbf92d5c5
95 |
96 | $ docker exec k8s-guide-control-plane curl -s http://198.51.100.0/prod
97 | Server address: 10.244.1.13:8080
98 | Server name: prod-559ccb4b56-5krn6
99 | Date: 29/Aug/2021:16:25:50 +0000
100 | URI: /prod
101 | Request ID: 2fed2ada42daf911057c798e74504453
102 | ```
103 |
104 | And the host-based routing:
105 |
106 | ```
107 | $ docker exec k8s-guide-control-plane curl -s --resolve prod:80:198.51.100.0 http://prod
108 | Server address: 10.244.1.13:8080
109 | Server name: prod-559ccb4b56-5krn6
110 | Date: 29/Aug/2021:16:25:58 +0000
111 | URI: /
112 | Request ID: 8b28ba1ccab240700a6264024785356b
113 |
114 | $ docker exec k8s-guide-control-plane curl -s --resolve dev:80:198.51.100.0 http://dev
115 | Server address: 10.244.1.14:8080
116 | Server name: dev-694776949d-w2fw7
117 | Date: 29/Aug/2021:16:26:08 +0000
118 | URI: /
119 | Request ID: 5c8a8cfa037a2ece0c3cfe8fd2e1597d
120 | ```
121 |
122 | To confirm that the HTTP routing is correct, take note of the `Server name` field of the response, which should match the name of the backend Pod:
123 |
124 | ```
125 | $ kubectl get pod
126 | NAME READY STATUS RESTARTS AGE
127 | dev-694776949d-w2fw7 1/1 Running 0 10m
128 | prod-559ccb4b56-5krn6 1/1 Running 0 10m
129 | ```
130 |
131 | ### Walkthrough
132 |
133 | Let's start by looking at the Ingress controller logs to see what happens when a new Ingress resource gets added to the API server:
134 |
135 | ```
136 | $ kubectl logs deploy/ingress-controller-ingress-nginx-controller
137 | I0826 16:10:40.364640 8 main.go:101] "successfully validated configuration, accepting" ingress="tkng-1/default"
138 | I0826 16:10:40.371315 8 store.go:365] "Found valid IngressClass" ingress="default/tkng-1" ingressclass="nginx"
139 | I0826 16:10:40.371770 8 event.go:282] Event(v1.ObjectReference{Kind:"Ingress", Namespace:"default", Name:"tkng-1", UID:"8229d775-0a73-4484-91bf-fdb9053922b5", APIVersion:"networking.k8s.io/v1", ResourceVersion:"22155", FieldPath:""}): type: 'Normal' reason: 'Sync' Scheduled for sync
140 | I0826 16:10:40.372381 8 controller.go:150] "Configuration changes detected, backend reload required"
141 | ingress.networking.k8s.io/tkng-1 created
142 | I0826 16:10:40.467838 8 controller.go:167] "Backend successfully reloaded"
143 | I0826 16:10:40.468147 8 event.go:282] Event(v1.ObjectReference{Kind:"Pod", Namespace:"kube-system", Name:"ingress-controller-ingress-nginx-controller-84d5f6c695-pd54s", UID:"b6b63172-0240-41fb-a110-e18f475caddf", APIVersion:"v1", ResourceVersion:"14712", FieldPath:""}): type: 'Normal' reason: 'RELOAD' NGINX reload triggered due to a change in configuration
144 | I0826 16:11:29.812516 8 status.go:284] "updating Ingress status" namespace="default" ingress="tkng-1" currentValue=[] newValue=[{IP:198.51.100.0 Hostname: Ports:[]}]
145 | I0826 16:11:29.818436 8 event.go:282] Event(v1.ObjectReference{Kind:"Ingress", Namespace:"default", Name:"tkng-1", UID:"8229d775-0a73-4484-91bf-fdb9053922b5", APIVersion:"networking.k8s.io/v1", ResourceVersion:"22343", FieldPath:""}): type: 'Normal' reason: 'Sync' Scheduled for sync
146 | ```
147 |
148 | Most of the above log is self-explanatory -- we see that the controller performs some initial validations, updates the configuration, triggers a proxy reload and updates the status field of the managed Ingress. We can see where the allocated IP is coming from by looking at the associated LoadBalancer service:
149 |
150 | ```
151 | $ kubectl -n kube-system get svc -l app.kubernetes.io/name=ingress-nginx
152 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
153 | ingress-controller-ingress-nginx-controller LoadBalancer 10.96.193.245 198.51.100.0 80:30881/TCP,443:31634/TCP 36m
154 | ```
155 |
156 | Now that we know what happens when a new Ingress is processed, let's take a look inside the Ingress controller pod
157 |
158 |
159 | ```
160 | $ kubectl -n kube-system exec -it deploy/ingress-controller-ingress-nginx-controller -- pgrep -l nginx
161 | 8 /nginx-ingress-controller
162 | 31 nginx: master process /usr/local/nginx/sbin/nginx -c /etc/nginx/nginx.conf
163 | 579 nginx: worker process
164 | 580 nginx: worker process
165 | 581 nginx: worker process
166 | 582 nginx: worker process
167 | 583 nginx: worker process
168 | 584 nginx: worker process
169 | 585 nginx: worker process
170 | 586 nginx: worker process
171 | 587 nginx: cache manager process
172 | ```
173 |
174 | Here we see the two main components described above -- a controller called `nginx-ingress-controller` and a proxy process `/usr/local/nginx/sbin/nginx`. We also see that the proxy is started with the `-c` argument, pointing it at the configuration file. If we look inside this configuration file, we should see the host-based routing [`server_name`](https://nginx.org/en/docs/http/ngx_http_core_module.html#server_name) directives:
175 | ```
176 | $ kubectl -n kube-system exec -it deploy/ingress-controller-ingress-nginx-controller -- cat /etc/nginx/nginx.conf | grep server_name
177 | server_names_hash_max_size 1024;
178 | server_names_hash_bucket_size 32;
179 | server_name_in_redirect off;
180 | server_name _ ;
181 | server_name dev ;
182 | server_name prod ;
183 | ```
184 |
185 | Similarly, we can view the path-based routing [`location`](https://nginx.org/en/docs/http/ngx_http_core_module.html#location) directives:
186 |
187 | ```
188 | kubectl exec -it deploy/ingress-controller-ingress-nginx-controller -- cat /etc/nginx/nginx.conf | grep "location /"
189 | location /prod/ {
190 | location /dev/ {
191 | location / {
192 | location /healthz {
193 | location /nginx_status {
194 | location / {
195 | location / {
196 | location / {
197 | location /healthz {
198 | location /is-dynamic-lb-initialized {
199 | location /nginx_status {
200 | location /configuration {
201 | location / {
202 | ```
203 |
204 | Examining the plain `nginx.conf` configuration can be a bit difficult, especially for large configs. A simpler way of doing it is using an [ingress-nginx plugin](https://kubernetes.github.io/ingress-nginx/kubectl-plugin/) for kubectl which can be installed with [krew](https://krew.sigs.k8s.io/docs/user-guide/setup/install/). For example, this is how we could list all active Ingress resources managed by this controller:
205 |
206 |
207 | ```
208 | $ kubectl ingress-nginx ingresses --all-namespaces
209 | NAMESPACE INGRESS NAME HOST+PATH ADDRESSES TLS SERVICE SERVICE PORT ENDPOINTS
210 | default tkng-1 /prod 198.51.100.0 NO prod 8080 1
211 | default tkng-1 /dev 198.51.100.0 NO dev 8080 1
212 | default tkng-2 prod/ 198.51.100.0 NO prod 8080 1
213 | default tkng-2 dev/ 198.51.100.0 NO dev 8080 1
214 | ```
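The same plugin can also dump the NGINX configuration rendered for a specific virtual host, which is often easier than grepping the full `nginx.conf`. The exact flags may vary between plugin versions, so treat the following as a sketch rather than a definitive invocation:

```
$ kubectl ingress-nginx -n kube-system conf --deployment ingress-controller-ingress-nginx-controller --host prod
```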
215 |
216 | Backend objects are [not managed](https://kubernetes.github.io/ingress-nginx/how-it-works/#avoiding-reloads-on-endpoints-changes) via a configuration file, so you won't see them in the `nginx.conf` rendered by the controller. The only way to view them is using the `ingress-nginx` plugin, e.g.:
217 |
218 | ```
219 | $ kubectl ingress-nginx -n kube-system backends --deployment ingress-controller-ingress-nginx-controller | jq -r '.[] | "\(.name) => \(.endpoints)"'
220 | default-dev-8080 => [{"address":"10.244.1.16","port":"8080"}]
221 | default-prod-8080 => [{"address":"10.244.2.14","port":"8080"}]
222 | upstream-default-backend => [{"address":"127.0.0.1","port":"8181"}]
223 | ```
224 |
225 |
226 | {{% notice warning %}}
227 | The above walkthrough is only applicable to the `ingress-nginx` controller. Other controllers may implement the same functionality differently, even when the underlying data plane proxy is the same (e.g. the community `ingress-nginx` controller vs F5's NGINX Ingress Controller). Ingress API changes also do not necessarily result in a complete proxy reload, assuming the underlying proxy supports hot restarts, e.g. [Envoy](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/operations/hot_restart).
228 | {{% /notice %}}
--------------------------------------------------------------------------------
/content/lab/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lab Setup"
3 | menutitle: "Lab Setup"
4 | date: 2020-09-13T17:33:04+01:00
5 | summary: "Prerequisites and setup of the lab environment"
6 | ---
7 |
8 | {{% notice info %}}
9 | All labs are stored in a separate Github repository -- [k8s-guide-labs](https://github.com/networkop/k8s-guide-labs)
10 | {{% /notice %}}
11 |
12 | ## Prerequisites
13 |
14 | In order to interact with the lab, the following tools need to be pre-installed:
15 |
16 | * **Docker** with `containerd` runtime. This is what you get by default when you install [docker-ce](https://docs.docker.com/engine/install/).
17 | * **kubectl** to interact with a Kubernetes cluster. Installation instructions can be found [here](https://kubernetes.io/docs/tasks/tools/install-kubectl/).
18 | * **helm** to bootstrap the cluster with Flux. Installation instructions can be found [here](https://github.com/helm/helm#install)
19 | * **make** is used to automate and orchestrate manual tasks. Most instructions will be provided as a series of make commands.
20 |
21 |
22 | {{% notice info %}}
23 | A number of additional tools (e.g. kind) will be installed automatically during the Setup phase
24 | {{% /notice %}}
25 |
26 | Some **optional extras** that may make your life a lot easier:
27 |
28 | * [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#optional-kubectl-configurations) and [docker](https://github.com/docker/docker-ce/tree/master/components/cli/contrib/completion) commands auto-completion.
29 | * [kubens/kubectx](https://github.com/ahmetb/kubectx) to easily switch between namespaces and contexts.
30 | * [stern](https://github.com/stern/stern) to read logs from multiple Pods at the same time.
31 | * [k9s](https://github.com/derailed/k9s) is a very convenient terminal dashboard for a Kubernetes cluster.
32 |
33 | Installation instructions will depend on the operating system. Here's an example [bash bootstrap script](https://gist.github.com/hellt/61242c680c78c3c813f20ecb9577a93e) for Ubuntu 20.04 LTS x86_64.
34 |
35 | ## Supported Operating Systems
36 |
37 | The main supported operating system is **Linux**. The kernel version can be anything that's `>=4.19`.
38 |
39 | {{% notice note %}}
40 | Most things should also work on macOS (Darwin). If you find a discrepancy and know how to fix it, please submit a PR.
41 | {{% /notice %}}
42 |
43 |
44 | ## Setup instructions
45 |
46 | Clone the k8s-guide-labs repository:
47 |
48 | ```bash
49 | git clone https://github.com/networkop/k8s-guide-labs.git && cd k8s-guide-labs
50 | ```
51 |
52 | To view the list of available operations do:
53 |
54 | ```bash
55 | $ make
56 |
57 | check Check prerequisites
58 | setup Setup the lab environment
59 | up Bring up the cluster
60 | connect Connect to Weave Scope
61 | tshoot Connect to the troubleshooting pod
62 | reset Reset k8s cluster
63 | down Shutdown
64 | cleanup Destroy the lab environment
65 | ```
66 |
67 | Check and install the required prerequisites:
68 |
69 | ```bash
70 | $ make check
71 | all good
72 | ```
73 |
74 | Setup the lab environment with:
75 |
76 | ```bash
77 | make setup
78 | ```
79 |
80 | Finally, bootstrap the cluster with Flux:
81 |
82 |
83 | ```bash
84 | make up
85 | ```
86 |
87 | {{% notice tip %}}
88 | All labs are built in [GitOps](https://www.weave.works/technologies/gitops/) style using [Flux](https://github.com/fluxcd/flux) as the controller that manages the state of the cluster.
89 | {{% /notice %}}
90 |
91 | ## Interacting with the Lab
92 |
93 | The lab consists of a local Kubernetes cluster along with a caching pull-through Docker registry to speed up download times. The cluster is built with [kind](https://github.com/kubernetes-sigs/kind) and the caching registry is a standalone container running alongside of it.
94 |
95 | To build the cluster for the first time run:
96 |
97 | ```
98 | make up
99 | ```
100 |
101 | In order to stop the cluster (e.g. to free up resources) run:
102 |
103 | ```
104 | make down
105 | ```
106 |
107 | In order to rebuild the cluster (combined `up` and `down`) run:
108 |
109 | ```
110 | make reset
111 | ```
112 |
113 | To completely destroy the lab environment, including the caching registry run:
114 |
115 |
116 | ```
117 | make cleanup
118 | ```
119 |
120 |
121 | ## Default applications
122 |
123 | The lab cluster is set up with a couple of applications that will be used throughout this guide:
124 |
125 | 1. **[Weave Scope](https://github.com/weaveworks/scope)** -- a tool to visualise and monitor Kubernetes cluster workloads.
126 |
127 | {{% notice tip %}}
128 | To connect to Weave Scope's front-end, run `make connect` and go to [http://localhost:8080](http://localhost:8080)
129 | {{% /notice %}}
130 |
131 |
132 | 2. **[netshoot](https://github.com/nicolaka/netshoot)** -- a Docker image pre-installed with a wide range of network troubleshooting tools, deployed as a DaemonSet.
133 |
134 | {{% notice tip %}}
135 | To connect to a Pod running on a particular Node (e.g. k8s-guide-worker), run `NODE=k8s-guide-worker make tshoot`
136 | {{% /notice %}}
137 |
--------------------------------------------------------------------------------
/content/security/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Network Policies"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 70
5 | summary: "Network Policies & Access Control"
6 | ---
7 |
8 | # Under construction [help needed]
--------------------------------------------------------------------------------
/content/services/Headless/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Headless"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 30
5 | draft: false
6 | ---
7 |
8 | This type of service does not perform any load-balancing and only implements DNS Service Discovery, based on the Kubernetes [DNS Spec](https://github.com/kubernetes/dns/blob/master/docs/specification.md#24---records-for-a-headless-service). Although this is the simplest and the most basic type of Service, its use is mainly limited to stateful applications like databases and other clustered workloads. In these use cases the assumption is that clients have some prior knowledge about the application they're going to be communicating with, e.g. the number of nodes or the naming structure, and can handle failover and load-balancing on their own.
9 |
10 | Some typical examples of stateful applications that use this kind of service are:
11 |
12 | * [zookeeper](https://github.com/bitnami/charts/blob/master/bitnami/zookeeper/templates/svc-headless.yaml)
13 | * [etcd](https://github.com/bitnami/charts/blob/master/bitnami/etcd/templates/svc-headless.yaml)
14 | * [consul](https://github.com/hashicorp/consul-helm/blob/master/templates/server-service.yaml)
15 |
16 | The only thing that makes a Service "Headless" is the `clusterIP: None` which, on the one hand, tells dataplane agents to ignore this resource and, on the other hand, tells the DNS plugin that it needs a [special type of processing](https://github.com/coredns/coredns/blob/5b9b079dabc7f71463cea3f0c6a92f338935039d/plugin/kubernetes/kubernetes.go#L461). The rest of the API parameters look similar to any other Service:
17 |
18 |
19 | ```yaml
20 | apiVersion: v1
21 | kind: Service
22 | metadata:
23 | name: headless
24 | namespace: default
25 | spec:
26 | clusterIP: None
27 | ports:
28 | - name: http
29 | port: 8080
30 | selector:
31 | app: database
32 | ```
33 |
34 | The corresponding Endpoints resource is still created, listing every healthy backend Pod, with the only notable distinctions being the absence of a hash in the Pod names and the presence of the `hostname` field.
35 |
36 | ```yaml
37 | apiVersion: v1
38 | kind: Endpoints
39 | metadata:
40 | labels:
41 | service.kubernetes.io/headless: ""
42 | name: headless
43 | namespace: default
44 | subsets:
45 | - addresses:
46 | - hostname: database-0
47 | ip: 10.244.0.12
48 | nodeName: k8s-guide-control-plane
49 | targetRef:
50 | kind: Pod
51 | name: database-0
52 | namespace: default
53 | ports:
54 | - name: http
55 | port: 8080
56 | protocol: TCP
57 | ```
58 | {{% notice info %}}
59 |
60 | In order to optimise the work of kube-proxy and other controllers that may need to read Endpoints, the Endpoints controller marks all such objects with the `service.kubernetes.io/headless` label.
61 | {{% /notice %}}
62 |
63 |
64 | ## Implementation
65 |
66 | This type of service is implemented entirely within a DNS plugin. The following is a simplified version of the [actual code](https://github.com/coredns/coredns/blob/5b9b079dabc7f71463cea3f0c6a92f338935039d/plugin/kubernetes/kubernetes.go#L383) from CoreDNS's kubernetes plugin:
67 |
68 | {{< gist networkop cc2f49248321e6547d880ea1406704ea >}}
69 |
70 |
71 | CoreDNS builds an internal representation of Services, containing only the information that may be relevant to DNS (IPs, port numbers) and dropping all of the other details. This information is later used to build a DNS response.
72 |
73 |
74 | ### Lab
75 |
76 | Assuming that the lab is already [setup](/lab/), we can install a stateful application (consul) with the following command:
77 |
78 | ```bash
79 | make headless
80 | ```
81 |
82 | Check that the consul statefulset has been deployed:
83 |
84 | ```bash
85 | $ kubectl get sts
86 | NAME READY AGE
87 | consul-server 3/3 25m
88 | ```
89 |
90 | Now we should be able to see a Headless Service in the default namespace:
91 |
92 | ```bash
93 | $ kubectl get svc consul-server
94 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
95 | consul-server   ClusterIP   None         <none>        8500/TCP,8301/TCP,8301/UDP,8302/TCP,8302/UDP,8300/TCP,8600/TCP,8600/UDP   29m
96 | ```
97 |
98 | To interact with this service, we can do a DNS query from any of the `net-tshoot` Pods:
99 |
100 | ```
101 | $ kubectl exec -it net-tshoot-8kqh6 -- dig consul-server +search
102 |
103 | ; <<>> DiG 9.16.11 <<>> consul-server +search
104 | ;; global options: +cmd
105 | ;; Got answer:
106 | ;; WARNING: .local is reserved for Multicast DNS
107 | ;; You are currently testing what happens when an mDNS query is leaked to DNS
108 | ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 2841
109 | ;; flags: qr aa rd; QUERY: 1, ANSWER: 3, AUTHORITY: 0, ADDITIONAL: 1
110 | ;; WARNING: recursion requested but not available
111 |
112 | ;; OPT PSEUDOSECTION:
113 | ; EDNS: version: 0, flags:; udp: 4096
114 | ; COOKIE: fe116ac7ab444725 (echoed)
115 | ;; QUESTION SECTION:
116 | ;consul-server.default.svc.cluster.local. IN A
117 |
118 | ;; ANSWER SECTION:
119 | consul-server.default.svc.cluster.local. 13 IN A 10.244.2.8
120 | consul-server.default.svc.cluster.local. 13 IN A 10.244.1.8
121 | consul-server.default.svc.cluster.local. 13 IN A 10.244.0.6
122 |
123 | ;; Query time: 0 msec
124 | ;; SERVER: 10.96.0.10#53(10.96.0.10)
125 | ;; WHEN: Sat Jun 05 15:30:09 UTC 2021
126 | ;; MSG SIZE rcvd: 245
127 | ```
128 |
129 | Applications interacting with this StatefulSet can use a DNS SRV lookup to discover the individual hostnames and port numbers exposed by the backend Pods:
130 |
131 | ```
132 | $ kubectl exec -it net-tshoot-8kqh6 -- dig consul-server +search srv +short
133 | 0 4 8301 consul-server-2.consul-server.default.svc.cluster.local.
134 | 0 4 8600 consul-server-2.consul-server.default.svc.cluster.local.
135 | 0 4 8300 consul-server-2.consul-server.default.svc.cluster.local.
136 | 0 4 8500 consul-server-2.consul-server.default.svc.cluster.local.
137 | 0 4 8302 consul-server-2.consul-server.default.svc.cluster.local.
138 | 0 4 8301 consul-server-1.consul-server.default.svc.cluster.local.
139 | 0 4 8600 consul-server-1.consul-server.default.svc.cluster.local.
140 | 0 4 8300 consul-server-1.consul-server.default.svc.cluster.local.
141 | 0 4 8500 consul-server-1.consul-server.default.svc.cluster.local.
142 | 0 4 8302 consul-server-1.consul-server.default.svc.cluster.local.
143 | 0 4 8301 consul-server-0.consul-server.default.svc.cluster.local.
144 | 0 4 8600 consul-server-0.consul-server.default.svc.cluster.local.
145 | 0 4 8300 consul-server-0.consul-server.default.svc.cluster.local.
146 | 0 4 8500 consul-server-0.consul-server.default.svc.cluster.local.
147 | 0 4 8302 consul-server-0.consul-server.default.svc.cluster.local.
148 | ```
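Because each address in the Endpoints object carries a `hostname`, every backend Pod also gets its own A record of the form `<hostname>.<service>.<namespace>.svc.<zone>`, as defined in the DNS Spec. For example (output omitted, Pod name as above):

```
$ kubectl exec -it net-tshoot-8kqh6 -- dig consul-server-0.consul-server +search +short
```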
149 |
150 |
--------------------------------------------------------------------------------
/content/services/Optimisations/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Optimisations"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 99
5 | ---
6 |
7 |
8 | # Under construction [help needed]
9 |
10 | Endpoint Slices
11 |
12 | Topology Aware Hints
13 |
14 | SessionAffinity
15 |
16 | externalTrafficPolicy
17 |
18 | DSR (with Cilium)
19 |
20 | Maglev (with Cilium)
21 |
22 | Traffic policies
23 |
24 | [ServiceAppProtocol](https://github.com/kubernetes/enhancements/blob/0e4d5df19d396511fe41ed0860b0ab9b96f46a2d/keps/sig-network/1507-app-protocol/README.md#risks-and-mitigations)
--------------------------------------------------------------------------------
/content/services/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Services"
3 | date: 2020-09-13T17:33:04+01:00
4 | summary: "Cluster load-balancing solutions"
5 | weight: 20
6 | ---
7 |
8 | "Service" is one of the most powerful and, as a result, complex abstractions in Kubernetes. It is, also, a very heavily overloaded term which makes it even more confusing for people approaching Kubernetes for the first time. This chapter will provide a high-level overview of different types of Services, their goals and how they relate to other cluster elements and APIs.
9 |
10 | {{% notice info %}}
11 | A lot of ideas and concepts in this chapter are based on numerous talks and presentations on this topic. It's difficult to make concrete attributions, however most credit goes to members of [Network Special Interest Group](https://github.com/kubernetes/community/tree/master/sig-network).
12 | {{% /notice %}}
13 |
14 | ## Services Hierarchy
15 |
16 | A good starting point to understand a Kubernetes Service is to think of it as a distributed load-balancer. Similar to traditional load-balancers, its data model can be reduced to the following two components:
17 |
18 | 1. **Grouping of backend Pods** -- all Pods with similar labels represent a single service and can receive and process incoming traffic for that service.
19 | 2. **Methods of exposure** -- each group of Pods can be exposed either internally, to other Pods in a cluster, or externally, to end-users or external services in many different ways.
20 |
21 | All Services implement the above functionality, but each in its own way, designed for its unique use case. In order to understand the various Service types, it helps to view them as a hierarchy -- starting from the simplest, with each subsequent type building on top of the previous one. The table below is an attempt to explore and explain this hierarchy:
22 |
23 | | Type | Description |
24 | | ----------| ----------- |
25 | | **Headless** | The simplest form of load-balancing, involving only DNS. Nothing is programmed in the data plane and no load-balancer VIP is assigned, however a DNS query will return the IPs of all backend Pods. The most typical use-case for this is stateful workloads (e.g. databases), where clients need a stable and predictable DNS name and can handle the loss of connectivity and failover on their own. |
26 | | **ClusterIP** | The most common type, assigns a unique ClusterIP (VIP) to a set of matching backend Pods. DNS lookup of a Service name returns the allocated ClusterIP. All ClusterIPs are configured in the data plane of each node as DNAT rules -- destination ClusterIP is translated to one of the PodIPs. These NAT translations always happen on the egress (client-side) node which means that Node-to-Pod reachability must be provided externally (by a [CNI plugin](/cni)). |
27 | | **NodePort** | Builds on top of the ClusterIP Service by allocating a unique static port in the root network namespace of each Node and mapping it (via Port Translation) to the port exposed by the backend Pods. The incoming traffic can hit _any_ cluster Node and, as long as the destination port matches the NodePort, it will get forwarded to one of the healthy backend Pods. |
28 | | **LoadBalancer** | Attracts external user traffic to a Kubernetes cluster. Each LoadBalancer Service instance is assigned a unique, externally routable IP address which is advertised to the underlying physical network via BGP or gratuitous ARP. This Service type is implemented outside of the main kube controller -- either by the underlying cloud as an external L4 load-balancer or with a cluster add-on like [MetalLB](https://github.com/metallb/metallb), [Porter](https://github.com/kubesphere/porter) or [kube-vip](https://kube-vip.io/). |
29 |
30 | {{% notice note %}}
31 | One Service type that doesn't fit with the rest is `ExternalName`. It instructs the DNS cluster add-on (e.g. CoreDNS) to respond with a CNAME, redirecting all queries for this Service's domain name to an external FQDN, which can simplify interacting with external services (for more details see the [Design Spec](https://github.com/kubernetes/community/blob/b3349d5b1354df814b67bbdee6890477f3c250cb/contributors/design-proposals/network/service-external-name.md#motivation)).
32 | {{% /notice %}}
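For example, a minimal `ExternalName` Service could look like this (a sketch -- the Service name and the target FQDN are made up for illustration):

```bash
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: external-api
spec:
  type: ExternalName
  externalName: api.example.com
EOF
```

A DNS query for `external-api.default.svc.cluster.local` would then return a CNAME pointing at `api.example.com`.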
33 |
34 | The following diagram illustrates how different Service types can be combined to expose a stateful application:
35 |
36 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=xy2cxxoLWAjYxmtAeYh4&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
37 |
38 |
39 | {{% notice info %}}
40 | Although not directly connected, most Services rely on Deployments and StatefulSets to create the required number of Pods with a unique set of labels.
41 | {{% /notice %}}
42 |
43 | ## Service APIs and Implementation
44 |
45 | Services have a relatively small and simple API. At the very least they expect the following to be defined:
46 |
47 | * An explicit list of backend **ports** that need to be exposed.
48 | * Label **selector** to understand which Pods are potential upstream candidates.
49 | * A Service **type** which defaults to `ClusterIP`.
50 |
51 | ```yaml
52 | kind: Service
53 | apiVersion: v1
54 | metadata:
55 | name: service-example
56 | spec:
57 | ports:
58 | - name: http
59 | port: 80
60 | targetPort: 80
61 | selector:
62 | app: nginx
63 | type: LoadBalancer
64 | ```
65 |
66 | {{% notice note %}}
67 | Some services may not have any label selectors, in which case the list of backend Pods can still be constructed manually. This is often used to interconnect with services outside of the Kubernetes cluster while still relying on internal mechanisms of service discovery.
68 | {{% /notice %}}
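As a sketch, this is how a selector-less Service could be paired with a manually assembled Endpoints object (the name and the `192.0.2.10` address are purely illustrative):

```bash
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: external-db
spec:
  ports:
  - name: tcp
    port: 5432
---
apiVersion: v1
kind: Endpoints
metadata:
  name: external-db    # must match the Service name
subsets:
- addresses:
  - ip: 192.0.2.10     # external backend, outside of the cluster
  ports:
  - name: tcp
    port: 5432
EOF
```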
69 |
70 | Service's internal architecture consists of two loosely-coupled components:
71 |
72 | * Kubernetes **control plane** -- a process running inside the `kube-controller-manager` binary that reacts to API events and builds an internal representation of each service instance. This internal representation is a special **Endpoints** object that gets created for every Service instance and contains a list of healthy backend endpoints (PodIP + port).
73 | * Distributed **data plane** -- a set of Node-local agents that read **Endpoints** objects and program their local data plane. This is most commonly implemented by `kube-proxy`, with various competing implementations from 3rd-party Kubernetes networking providers like Cilium, Calico, kube-router and others.
74 |
75 | Another less critical, but nonetheless important component is DNS. Internally, the DNS add-on is just a Pod running in the cluster that caches `Service` and `Endpoints` objects and responds to incoming queries according to the DNS-Based Service Discovery [specification](https://github.com/kubernetes/dns/blob/master/docs/specification.md), which defines the format of incoming queries and the expected structure of responses.
76 |
77 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=HR_OWBqgmX47NSTQvTWL&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
78 |
79 | {{% notice info %}}
80 | The following series of [short YouTube videos](https://www.youtube.com/playlist?list=PLoWxE_5hnZUZMWrEON3wxMBoIZvweGeiq) provides a very good, albeit vendor-centric, overview of the various types of Kubernetes networking.
81 | {{% /notice %}}
82 |
83 |
--------------------------------------------------------------------------------
/content/services/clusterIP/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "ClusterIP"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 40
5 | ---
6 |
7 | When people say that Kubernetes networking is difficult, they very often refer to this type of service. One of the reasons for this perception is that all of its complexity is hidden behind a very minimalistic API. A common way of defining a Service only takes 5 lines of configuration (plus the standard metadata):
8 |
9 | ```yaml
10 | apiVersion: v1
11 | kind: Service
12 | metadata:
13 |   name: clusterip-example
14 | spec:
15 | ports:
16 | - name: http
17 | port: 80
18 | selector:
19 | app: my-backend-app
20 | ```
21 |
22 | Quite unexpectedly, these 5 lines can generate a large amount of state inside the cluster as each Service has to be implemented on all Nodes and its state grows proportionally to the number of backend Endpoints. In order to better understand the networking behind it, the remainder of this chapter will be broken down into the following sections:
23 |
24 | - **Control Plane** will examine the mechanics of interaction between the user input, the API server processing it and a distributed set of node-local agents ultimately consuming it.
25 | - **Data Plane** will cover some of the standard implementations including iptables, ipvs and eBPF.
26 |
--------------------------------------------------------------------------------
/content/services/clusterIP/control-plane.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Control Plane"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 10
5 | ---
6 |
7 | Let's start our exploration with the first step of any Kubernetes cluster's lifecycle -- bootstrapping. At this stage, a cluster admin is expected to provide a number of parameters, one of which is called `service-cidr` (or something similar, depending on the orchestrator) and gets mapped to the `service-cluster-ip-range` argument of the `kube-apiserver`.
8 |
9 | {{% notice note %}}
10 | For the sake of simplicity we'll assume `kubeadm` is used to orchestrate a cluster.
11 | {{% /notice %}}
12 |
13 | An orchestrator will suggest a default value for this range (e.g. `10.96.0.0/12`) which, most of the time, is safe to use. As we'll see later, this range is completely "virtual", i.e. it does not need any coordination with the underlying network and can be re-used between clusters (one notable exception being [this Calico feature](https://docs.projectcalico.org/networking/advertise-service-ips#advertise-service-cluster-ip-addresses)). The only constraints for this value are:
14 |
15 | - It must not overlap with any of the Pod IP ranges or Node IPs of the same cluster.
16 | - It must not be loopback (127.0.0.0/8 for IPv4, ::1/128 for IPv6) or link-local (169.254.0.0/16 and 224.0.0.0/24 for IPv4, fe80::/10 for IPv6).
17 |
18 | Once a Kubernetes cluster has been bootstrapped, every new `ClusterIP` service type will get a unique IP allocated from this range, for example:
19 |
20 |
21 | ```bash
22 | $ kubectl create svc clusterip test --tcp=80 && kubectl get svc test
23 | service/test created
24 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
25 | test   ClusterIP   10.96.37.70   <none>        80/TCP    0s
26 | ```
27 |
28 | {{% notice info %}}
29 | The first IP from the Service CIDR range is reserved and always assigned to a special `kubernetes` service. See [this explanation](https://networkop.co.uk/post/2020-06-kubernetes-default/) for more details.
30 | {{% /notice %}}
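For example, in a cluster bootstrapped with the default `10.96.0.0/12` range, this is what it would look like (illustrative output):

```bash
$ kubectl get svc kubernetes
NAME         TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)   AGE
kubernetes   ClusterIP   10.96.0.1    <none>        443/TCP   1d
```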
31 |
32 |
33 | Inside the [`kube-controller-manager`](https://kubernetes.io/docs/reference/command-line-tools-reference/kube-controller-manager/), an endpoints controller's [reconciliation loop](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L378) builds an internal representation for each Service, which includes a list of all associated Endpoints. From then on, both `Service` and `Endpoints` resources co-exist, with the former being the user-facing, aggregated view of a load-balancer and the latter being the detailed, low-level set of IP and port details that will be programmed in the dataplane. There are two ways to compile a list of Endpoints:
34 |
35 |
36 | - **Label selectors** is the most common approach; it relies on labels to [identify](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L414) all matching Pods and collects their [IP](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L259) and [port](https://github.com/kubernetes/kubernetes/blob/52eea971c57580c6b1b74f0a12bf9cc6083a4d6b/pkg/controller/endpoint/endpoints_controller.go#L479) information.
37 | - **Manual configuration** relies on users to assemble their own set of Endpoints; this approach is very rarely used but can give an intra-cluster address and hostname to any external service.
38 |
39 | All Endpoints are stored in an `Endpoints` resource that bears the same name as its parent Service. Below is an example of how it might look for the `kubernetes` service:
40 |
41 | ```yaml
42 | apiVersion: v1
43 | kind: Endpoints
44 | metadata:
45 | labels:
46 | endpointslice.kubernetes.io/skip-mirror: "true"
47 | name: kubernetes
48 | namespace: default
49 | subsets:
50 | - addresses:
51 | - ip: 172.18.0.4
52 | ports:
53 | - name: https
54 | port: 6443
55 | protocol: TCP
56 | ```
57 |
58 | {{% notice info %}}
59 | Under the hood Endpoints are implemented as a set of slices; this will be covered in the Optimisations sections.
60 | {{% /notice %}}
61 |
62 | It is worth noting that the [DNS Spec](https://github.com/kubernetes/dns/blob/master/docs/specification.md#23---records-for-a-service-with-clusterip), mentioned briefly in the previous chapter, also defines the behaviour for the `ClusterIP` type services. Specifically, the following 3 query types must be supported:
63 |
64 | * **A/AAAA** Records -- will return a single ClusterIP for any query matching the Service name (`metadata.name`) from the same namespace, or `<service>.<namespace>.svc.<zone>` from a different namespace.
65 | * **SRV** Record -- will return an SRV record for each unique port + protocol combination.
66 | * **PTR** Record -- can be used to lookup a service name based on provided `ClusterIP`.
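As an illustration, the following queries could be run from any Pod that uses the cluster DNS resolver against the `test` Service created earlier (output omitted; the port name `http` and the `10.96.37.70` ClusterIP are assumptions based on the example above):

```bash
dig +search test A                                          # A record -> ClusterIP
dig +search _http._tcp.test.default.svc.cluster.local SRV   # one SRV record per port+protocol
dig -x 10.96.37.70                                          # PTR -> test.default.svc.cluster.local.
```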
67 |
68 |
69 | ---
70 |
71 | Kubernetes' `kube-controller-manager` is constantly collecting, processing and updating all Endpoints and Service resources, however nothing is being done with this information yet. Its ultimate consumers are a set of node-local agents (controllers) that use it to program their local dataplane. Most of these node-local agents use the
72 | [client-go](https://github.com/kubernetes/sample-controller/blob/master/docs/controller-client-go.md) library to synchronize and process updates coming from the API server, which means they all share the following behaviour:
73 |
74 | * Each node-local agent maintains a local cache of all interesting objects, which gets synced in the beginning (via a `List` operation) and observed for the remainder of their lifecycle (via a `Watch` operation).
75 | * The [architecture](https://github.com/kubernetes/sample-controller/blob/master/docs/controller-client-go.md) with two queues and a local cache ensures that controllers can absorb multiple frequent changes of the same object thereby minimising the churn in the dataplane.
76 |
77 |
--------------------------------------------------------------------------------
/content/services/clusterIP/dataplane/IPVS.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "IPVS"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 20
5 | ---
6 |
7 | IPTables was the first implementation of kube-proxy's dataplane, however, over time its limitations have become more pronounced, especially when operating at scale. There are several side-effects of implementing a proxy with something that was designed to be a firewall, the main one being a limited set of data structures. This manifests itself in the fact that every ClusterIP Service needs a unique entry, and these entries can't be grouped -- they have to be processed sequentially as chains of rules. This means that any dataplane lookup or a create/update/delete operation needs to traverse the chain until a match is found, which, at a large enough scale, can result in [minutes](https://docs.google.com/presentation/d/1BaIAywY2qqeHtyGZtlyAp89JIZs59MZLKcFLxKE6LyM/edit#slide=id.p20) of added processing time.
8 |
9 | {{% notice note %}}
10 | Detailed performance analysis and measurement results of running iptables at scale can be found in the [Additional Reading](#additional-reading) section at the bottom of the page.
11 | {{% /notice %}}
12 |
13 | All this led to `ipvs` being added as an [enhancement proposal](https://github.com/kubernetes/enhancements/tree/0e4d5df19d396511fe41ed0860b0ab9b96f46a2d/keps/sig-network/265-ipvs-based-load-balancing) and eventually graduating to GA in Kubernetes version 1.11. The new dataplane implementation offers a number of improvements over the existing `iptables` mode:
14 |
15 | * All Service load-balancing is migrated to IPVS which can perform in-kernel lookups and masquerading in constant time, regardless of the number of configured Services or Endpoints.
16 |
17 | * The remaining rules in IPTables have been re-engineered to make use of [ipset](https://wiki.archlinux.org/title/Ipset), making the lookups more efficient.
18 |
19 | * Multiple additional load-balancer [scheduling modes](https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/#parameter-changes) are now available, with the default one being a simple round-robin.
20 |
21 |
22 | On the surface, this makes the decision to use `ipvs` an obvious one, however, since `iptables` has been the default mode for so long, some of its quirks and undocumented side-effects have become the standard. One of the fortunate side-effects of the `iptables` mode is that `ClusterIP` is never bound to any kernel interface and remains completely virtual (as a NAT rule). So when `ipvs` changed this behaviour by introducing a dummy `kube-ipvs0` interface, it [made it possible](https://github.com/kubernetes/kubernetes/issues/72236) for processes inside Pods to access any host-local services bound to `0.0.0.0` by targeting any existing `ClusterIP`. Although this does make `ipvs` less safe by default, it doesn't mean that these risks can't be mitigated (e.g. by not binding to `0.0.0.0`).
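Once the `ipvs` mode is enabled (see the Lab Setup below), one way to observe this side-effect is to list the addresses assigned to the dummy interface -- every ClusterIP in the cluster should appear on it:

```bash
docker exec k8s-guide-worker ip -4 addr show kube-ipvs0 | grep inet
```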
23 |
24 | The diagram below is a high-level and simplified view of two distinct datapaths for the same `ClusterIP` virtual service -- one from a remote Pod and one from a host-local interface.
25 |
26 |
27 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=BucKDkpFbDgBnzcmmJd5&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
28 |
29 |
30 |
31 | ### Lab Setup
32 |
33 | Assuming that the lab environment is already [set up](/lab/), ipvs can be enabled with the following command:
34 |
35 | ```bash
36 | make ipvs
37 | ```
38 |
39 | Under the covers, the above command updates the proxier mode in kube-proxy's ConfigMap, so in order for this change to get picked up, we need to restart all of the agents and flush out any existing iptables rules:
40 |
41 | ```bash
42 | make flush-nat
43 | ```
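To double-check that the new mode has been picked up, we can inspect kube-proxy's ConfigMap directly (a quick sanity check, assuming the standard kubeadm/kind ConfigMap layout):

```bash
kubectl -n kube-system get configmap kube-proxy -o yaml | grep "mode:"
```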
44 |
45 | Check the logs to make sure kube-proxy has loaded all of the [required kernel modules](https://github.com/kubernetes/kubernetes/blob/2f753ec4c826895e4ccd3d6bdda2b1ab777ceeb8/pkg/util/ipvs/ipvs.go#L130). In case of a failure, the following error will be present in the logs and kube-proxy will fall back to the `iptables` mode:
46 |
47 |
48 | ```bash
49 | $ make kube-proxy-logs | grep -i ipvs
50 | E0626 17:19:43.491383 1 server_others.go:127] Can't use the IPVS proxier: IPVS proxier will not be used because the following required kernel modules are not loaded: [ip_vs ip_vs_rr ip_vs_wrr ip_vs_sh]
51 | ```
52 |
53 | Another way to confirm that the change has succeeded is to check that Nodes now have a new dummy ipvs device:
54 |
55 | {{< highlight bash "linenos=false,hl_lines=2" >}}
56 | $ docker exec -it k8s-guide-worker ip link show kube-ipvs0
57 | 7: kube-ipvs0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group default
58 | link/ether 22:76:01:f0:71:9f brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 0 maxmtu 0
59 | dummy addrgenmode eui64 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535
60 | {{< / highlight >}}
61 |
62 |
63 | {{% notice note %}}
64 | One thing to remember when migrating from iptables to ipvs on an existing cluster (as opposed to rebuilding it from scratch) is that all of the KUBE-SVC/KUBE-SEP chains will still be there, at least until they are cleaned up manually or the node is rebooted.
65 | {{% /notice %}}
66 |
67 | Spin up a test deployment and expose it as a `ClusterIP` Service:
68 |
69 |
70 | ```bash
71 | kubectl create deploy web --image=nginx --replicas=2
72 | kubectl expose deploy web --port 80
73 | ```
74 |
75 | Check that all Pods are up and note the IP allocated to our Service:
76 |
77 | {{< highlight bash "linenos=false,hl_lines=3-4 7" >}}
78 | $ kubectl get pod -owide -l app=web
79 | NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
80 | web-96d5df5c8-6bgpr   1/1     Running   0          111s   10.244.1.6   k8s-guide-worker    <none>           <none>
81 | web-96d5df5c8-wkfrb   1/1     Running   0          111s   10.244.2.4   k8s-guide-worker2   <none>           <none>
82 | $ kubectl get svc web
83 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
84 | web    ClusterIP   10.96.119.228   <none>        80/TCP    92s
85 | {{< / highlight >}}
86 |
87 | Before we move forward, there are a couple of dependencies we need to satisfy:
88 |
89 | 1. Pick one of the Nodes hosting a test deployment and install the following packages:
90 |
91 | ```bash
92 | docker exec k8s-guide-worker apt update
93 | docker exec k8s-guide-worker apt install ipset ipvsadm -y
94 | ```
95 |
96 | 2. On the same Node set up the following set of aliases to simplify access to iptables, ipvs and ipset:
97 |
98 | ```bash
99 | alias ipt="docker exec k8s-guide-worker iptables -t nat -nvL"
100 | alias ipv="docker exec k8s-guide-worker ipvsadm -ln"
101 | alias ips="docker exec k8s-guide-worker ipset list"
102 | ```
103 |
104 | ### Use case #1: Pod-to-Service communication
105 |
106 | Any packet leaving a Pod will first pass through the `PREROUTING` chain which is where kube-proxy intercepts all Service-bound traffic:
107 |
108 | {{< highlight bash "linenos=false,hl_lines=3-4 7" >}}
109 | $ ipt PREROUTING
110 | Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
111 | pkts bytes target prot opt in out source destination
112 | 128 12020 KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
113 | 0 0 DOCKER_OUTPUT all -- * * 0.0.0.0/0 192.168.224.1
114 | {{< / highlight >}}
115 |
116 | The size of the `KUBE-SERVICES` chain is reduced compared to the [`iptables`](/services/clusterip/dataplane/iptables/) mode and the lookup stops once the destination IP is matched against the `KUBE-CLUSTER-IP` ipset:
117 |
118 | {{< highlight bash "linenos=false,hl_lines=6" >}}
119 | $ ipt KUBE-SERVICES
120 | Chain KUBE-SERVICES (2 references)
121 | pkts bytes target prot opt in out source destination
122 | 0 0 KUBE-MARK-MASQ all -- * * !10.244.0.0/16 0.0.0.0/0 /* Kubernetes service cluster ip + port for masquerade purpose */ match-set KUBE-CLUSTER-IP dst,dst
123 | 0 0 KUBE-NODE-PORT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL
124 | 0 0 ACCEPT all -- * * 0.0.0.0/0 0.0.0.0/0 match-set KUBE-CLUSTER-IP dst,dst
125 | {{< / highlight >}}
126 |
127 | This ipset contains all existing ClusterIPs and the lookup is performed in [O(1)](https://en.wikipedia.org/wiki/Time_complexity#Constant_time) time:
128 |
129 | {{< highlight bash "linenos=false,hl_lines=18" >}}
130 | $ ips KUBE-CLUSTER-IP
131 | Name: KUBE-CLUSTER-IP
132 | Type: hash:ip,port
133 | Revision: 5
134 | Header: family inet hashsize 1024 maxelem 65536
135 | Size in memory: 768
136 | References: 2
137 | Number of entries: 9
138 | Members:
139 | 10.96.0.10,udp:53
140 | 10.96.0.1,tcp:443
141 | 10.96.0.10,tcp:53
142 | 10.96.148.225,tcp:80
143 | 10.96.68.46,tcp:3030
144 | 10.96.10.207,tcp:3030
145 | 10.96.0.10,tcp:9153
146 | 10.96.159.35,tcp:11211
147 | 10.96.119.228,tcp:80
148 | {{< / highlight >}}
149 |
150 | Following the lookup in the `PREROUTING` chain, our packet goes through the [routing decision](https://upload.wikimedia.org/wikipedia/commons/3/37/Netfilter-packet-flow.svg) stage and enters Netfilter's `NF_INET_LOCAL_IN` hook, which is where it gets intercepted and redirected to IPVS.
151 |
152 |
153 |
154 | {{< highlight bash "linenos=false,hl_lines=20-22" >}}
155 | $ ipv
156 | IP Virtual Server version 1.2.1 (size=4096)
157 | Prot LocalAddress:Port Scheduler Flags
158 | -> RemoteAddress:Port Forward Weight ActiveConn InActConn
159 | TCP 192.168.224.4:31730 rr
160 | -> 10.244.1.6:80 Masq 1 0 0
161 | -> 10.244.2.4:80 Masq 1 0 0
162 | TCP 10.96.0.1:443 rr
163 | -> 192.168.224.3:6443 Masq 1 0 0
164 | TCP 10.96.0.10:53 rr
165 | -> 10.244.0.3:53 Masq 1 0 0
166 | -> 10.244.0.4:53 Masq 1 0 0
167 | TCP 10.96.0.10:9153 rr
168 | -> 10.244.0.3:9153 Masq 1 0 0
169 | -> 10.244.0.4:9153 Masq 1 0 0
170 | TCP 10.96.10.207:3030 rr
171 | -> 10.244.1.4:3030 Masq 1 0 0
172 | TCP 10.96.68.46:3030 rr
173 | -> 10.244.2.2:3030 Masq 1 0 0
174 | TCP 10.96.119.228:80 rr
175 | -> 10.244.1.6:80 Masq 1 0 0
176 | -> 10.244.2.4:80 Masq 1 0 0
177 | TCP 10.96.148.225:80 rr
178 | -> 10.244.1.6:80 Masq 1 0 0
179 | -> 10.244.2.4:80 Masq 1 0 0
180 | TCP 10.96.159.35:11211 rr
181 | -> 10.244.1.3:11211 Masq 1 0 0
182 | TCP 10.244.2.1:31730 rr
183 | -> 10.244.1.6:80 Masq 1 0 0
184 | -> 10.244.2.4:80 Masq 1 0 0
185 | TCP 127.0.0.1:31730 rr
186 | -> 10.244.1.6:80 Masq 1 0 0
187 | -> 10.244.2.4:80 Masq 1 0 0
188 | UDP 10.96.0.10:53 rr
189 | -> 10.244.0.3:53 Masq 1 0 8
190 | -> 10.244.0.4:53 Masq 1 0 8
191 | {{< / highlight >}}
192 |
193 |
194 | This is where the packet gets DNAT'ed to the IP of the selected backend Pod (`10.244.1.6` in our case) and continues on to its destination without any further modifications, following the forwarding path built by a CNI plugin.
195 |
196 | ### Use case #2: Any-to-Service communication
197 |
198 | Any host-local service trying to communicate with a ClusterIP will first get its packet through `OUTPUT` and `KUBE-SERVICES` chains:
199 |
200 | {{< highlight bash "linenos=false,hl_lines=4" >}}
201 | $ ipt OUTPUT
202 | Chain OUTPUT (policy ACCEPT 5 packets, 300 bytes)
203 | pkts bytes target prot opt in out source destination
204 | 1062 68221 KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
205 | 287 19636 DOCKER_OUTPUT all -- * * 0.0.0.0/0 192.168.224.1
206 | {{< / highlight >}}
207 |
208 | Since the source IP does not belong to the PodCIDR range, our packet takes a detour via the `KUBE-MARK-MASQ` chain:
209 |
210 | {{< highlight bash "linenos=false,hl_lines=4" >}}
211 | $ ipt KUBE-SERVICES
212 | Chain KUBE-SERVICES (2 references)
213 | pkts bytes target prot opt in out source destination
214 | 0 0 KUBE-MARK-MASQ all -- * * !10.244.0.0/16 0.0.0.0/0 /* Kubernetes service cluster ip + port for masquerade purpose */ match-set KUBE-CLUSTER-IP dst,dst
215 | 0 0 KUBE-NODE-PORT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL
216 | 0 0 ACCEPT all -- * * 0.0.0.0/0 0.0.0.0/0 match-set KUBE-CLUSTER-IP dst,dst
217 | {{< / highlight >}}
218 |
219 | Here the packet gets marked for future SNAT, to make sure it will have a return path from the Pod:
220 |
221 | {{< highlight bash "linenos=false,hl_lines=4" >}}
222 | $ ipt KUBE-MARK-MASQ
223 | Chain KUBE-MARK-MASQ (13 references)
224 | pkts bytes target prot opt in out source destination
225 | 0 0 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK or 0x4000
226 | {{< / highlight >}}
227 |
228 | The following few steps are exactly the same as described for the previous use case:
229 |
230 | * The packet reaches the end of the `KUBE-SERVICES` chain.
231 | * The routing lookup returns a local dummy ipvs interface.
232 | * IPVS intercepts the packet and performs the backend selection and NATs the destination IP address.
233 |
234 | The modified packet continues along the forwarding path until it hits the egress `veth` interface, where it gets picked up by the `POSTROUTING` chain:
235 |
236 | {{< highlight bash "linenos=false,hl_lines=4" >}}
237 | $ ipt POSTROUTING
238 | Chain POSTROUTING (policy ACCEPT 5 packets, 300 bytes)
239 | pkts bytes target prot opt in out source destination
240 | 1199 80799 KUBE-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
241 | 0 0 DOCKER_POSTROUTING all -- * * 0.0.0.0/0 192.168.224.1
242 | 920 61751 KIND-MASQ-AGENT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type !LOCAL /* kind-masq-agent: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom KIND-MASQ-AGENT chain */
243 | {{< / highlight >}}
244 |
245 | This is where the source IP of the packet gets modified to match that of the egress interface, so that the destination Pod knows where to send its reply:
246 |
247 | {{< highlight bash "linenos=false,hl_lines=4 " >}}
248 | $ ipt KUBE-POSTROUTING
249 | Chain KUBE-POSTROUTING (1 references)
250 | pkts bytes target prot opt in out source destination
251 | 0 0 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */ match-set KUBE-LOOP-BACK dst,dst,src
252 | 1 60 RETURN all -- * * 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
253 | 0 0 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
254 | 0 0 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service traffic requiring SNAT */ random-fully
255 | {{< / highlight >}}
256 |
257 | The final masquerading action is performed if the destination IP and Port match one of the local Endpoints which are stored in the `KUBE-LOOP-BACK` ipset:
258 |
259 | {{< highlight bash "linenos=false,hl_lines=11" >}}
260 | $ ips KUBE-LOOP-BACK
261 | Name: KUBE-LOOP-BACK
262 | Type: hash:ip,port,ip
263 | Revision: 5
264 | Header: family inet hashsize 1024 maxelem 65536
265 | Size in memory: 360
266 | References: 1
267 | Number of entries: 2
268 | Members:
269 | 10.244.1.2,tcp:3030,10.244.1.2
270 | 10.244.1.6,tcp:80,10.244.1.6
271 | {{< / highlight >}}
272 |
273 | {{% notice info %}}
274 | It should be noted that, similar to the iptables mode, all of the above lookups are only performed for the first packet of the session and all subsequent packets follow a much shorter path in the conntrack subsystem.
275 | {{% /notice %}}
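If you want to see those conntrack entries, the `conntrack` CLI can be used on the node (assuming it is installed, e.g. via `apt install conntrack`), filtering on the ClusterIP of our test Service:

```bash
docker exec k8s-guide-worker conntrack -L -d 10.96.119.228
```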
276 |
277 |
278 | ### Additional reading
279 |
280 | [Scaling Kubernetes to Support 50,000 Services](https://github.com/sbueringer/kubecon-slides/blob/4a793c54a5bb31ededb2ec3ba230aaa94bc003d7/slides/2017-kubecon-eu/Scale%20Kubernetes%20to%20Support%2050,000%20Services%20%5BI%5D%20-%20Haibin%20Xie%20&%20Quinton%20Hoole,%20Huawei%20Technologies%20-%20Scale%20Kubernetes%20to%20Support%2050000%20Services.pdf)
281 |
282 | [Comparing kube-proxy modes: iptables or IPVS?](https://www.projectcalico.org/comparing-kube-proxy-modes-iptables-or-ipvs/)
283 |
284 | [IPVS-Based In-Cluster Load Balancing Deep Dive](https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/)
--------------------------------------------------------------------------------
/content/services/clusterIP/dataplane/eBPF.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "eBPF"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 40
5 | ---
6 |
7 | eBPF has emerged as a new alternative to the IPTables and IPVS mechanisms implemented by `kube-proxy`, with the promise of reducing CPU utilization and latency, improving throughput and increasing scale.
8 | As of today, there are two implementations of the Kubernetes Services data plane in eBPF -- one from [Calico](https://docs.projectcalico.org/maintenance/ebpf/enabling-bpf) and one from [Cilium](https://docs.cilium.io/en/latest/gettingstarted/kubeproxy-free/).
9 | Since Cilium was the first product to introduce a `kube-proxy`-less data plane, we'll focus on its implementation in this chapter. However, it should be noted that there is no "standard" way to implement the Services data plane in eBPF, so Calico's approach may be different.
10 |
11 | Cilium's `kube-proxy` replacement is called [Host-Reachable Services](https://docs.cilium.io/en/v1.10/gettingstarted/host-services/#host-services) and it literally makes any ClusterIP reachable from the host (Kubernetes Node). It does that by attaching eBPF programs to cgroup hooks, intercepting socket-related system calls and transparently modifying the ones destined for ClusterIP VIPs. Since Cilium attaches these programs to the root cgroup, they affect all sockets of all processes on the host. As of today, Cilium's implementation supports the following syscalls, which cover most of the use-cases but [depend](https://docs.cilium.io/en/latest/gettingstarted/kubeproxy-free/#limitations) on the underlying Linux kernel version:
12 |
13 | ```
14 | $ bpftool cgroup tree /run/cilium/cgroupv2/
15 | CgroupPath
16 | ID AttachType AttachFlags Name
17 | /run/cilium/cgroupv2
18 | 2005 connect4
19 | 1970 connect6
20 | 2007 post_bind4
21 | 2002 post_bind6
22 | 2008 sendmsg4
23 | 2003 sendmsg6
24 | 2009 recvmsg4
25 | 2004 recvmsg6
26 | 2006 getpeername4
27 | 1991 getpeername6
28 | ```
29 |
30 | This is what typically happens when a client, e.g. a process inside a Pod, tries to communicate with a remote ClusterIP:
31 |
32 | * Client's network application invokes one of the syscalls.
33 | * eBPF program attached to this syscall's hook is executed.
34 | * The input to this eBPF program contains a number of socket parameters like destination IP and port number.
35 | * These input details are compared to existing ClusterIP Services and if no match is found, control flow is returned to the Linux kernel.
36 | * In case one of the existing Services did match, the eBPF program selects one of the backend Endpoints and "redirects" the syscall to it by modifying its destination address, before passing it back to the Linux kernel.
37 | * Subsequent data is exchanged over the opened socket by calling `read()` and `write()` without any involvement from the eBPF program.
38 |
39 | It's very important to understand that in this case the destination NAT translation happens at the syscall level, before the packet is even built by the kernel. What this means is that the first packet to leave the client network namespace already has the right destination IP and port number and can be forwarded by a separate data plane managed by a CNI plugin (although in most cases the entire data plane is managed by the same plugin).
40 |
41 | {{% notice info %}}
42 | A somewhat similar idea has previously been implemented by a product called Appswitch. See [1](https://hci.stanford.edu/cstr/reports/2017-01.pdf), [2](https://appswitch.readthedocs.io/en/latest/index.html), [3](https://networkop.co.uk/post/2018-05-29-appswitch-sdn/) for more details.
43 | {{% /notice %}}
44 |
45 | Below is a high-level diagram of what happens when a Pod on Node `worker-2` tries to communicate with a ClusterIP `10.96.32.28:80`. See [section below](/services/clusterip/dataplane/ebpf/#a-day-in-the-life-of-a-packet) for a detailed code walkthrough.
46 |
47 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=oxqjjDhMhjtZh66px_17&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
48 |
49 |
50 | ## Lab
51 |
52 |
53 | ### Preparation
54 |
55 | Assuming that the lab environment is already [set up](/lab/), Cilium can be enabled with the following command:
56 |
57 | ```bash
58 | make cilium
59 | ```
60 |
61 | Wait for Cilium daemonset to initialize:
62 |
63 | ```bash
64 | make cilium-wait
65 | ```
66 |
67 | Now we need to "kick" all Pods to restart and pick up the new CNI plugin:
68 |
69 | ```bash
70 | make nuke-all-pods
71 | ```
72 |
73 | To make sure there is no interference from `kube-proxy`, we'll remove it completely along with any IPTables rules set up by it:
74 |
75 | ```
76 | make nuke-kube-proxy
77 | ```
78 |
79 | Check that Cilium is healthy:
80 |
81 | ```bash
82 | $ make cilium-check | grep health
83 | Cilium health daemon: Ok
84 | Controller Status: 40/40 healthy
85 | Cluster health: 3/3 reachable (2021-08-02T19:52:07Z)
86 | ```
87 |
88 | In order to have a working ClusterIP to test against, create a deployment with 3 nginx Pods and examine the assigned ClusterIP and IPs of the backend Pods:
89 |
90 | ```
91 | make deployment && make scale-up && make cluster-ip
92 | $ kubectl get svc web
93 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
94 | web    ClusterIP   10.96.32.28   <none>        80/TCP    5s
95 | $ kubectl get ep web
96 | NAME ENDPOINTS AGE
97 | web 10.0.0.234:80,10.0.0.27:80,10.0.2.76:80 11m
98 | ```
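We can also confirm that Cilium has programmed this Service into its load-balancing maps by asking the agent directly (a sketch; the Pod selection assumes Cilium runs in the `cilium` namespace, as it does in this lab):

```bash
cilium=$(kubectl get -l k8s-app=cilium pods -n cilium --field-selector spec.nodeName=k8s-guide-worker2 -o jsonpath='{.items[0].metadata.name}')
kubectl -n cilium exec -it $cilium -- cilium service list
```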
99 |
100 | Now let's see what happens when a client tries to communicate with this Service.
101 |
102 | ## A day in the life of a Packet
103 |
104 | First, let's take a look at the first few packets of a client session. Keep a close eye on the destination IP of the captured packets:
105 | ```
106 | $ NODE=k8s-guide-worker2 make tshoot
107 | bash-5.1# tcpdump -enni any -q &
108 | bash-5.1# tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
109 | listening on any, link-type LINUX_SLL (Linux cooked v1), capture size 262144 bytes
110 |
111 | bash-5.1# curl -s 10.96.32.28 | grep Welcome
112 | Welcome to nginx!
113 |
Welcome to nginx!
114 | 20:11:29.780374 eth0 Out ifindex 24 aa:24:9c:63:2e:7d 10.0.2.202.45676 > 10.0.0.27.80: tcp 0
115 | 20:11:29.781996 eth0 In ifindex 24 2a:89:e2:43:42:6e 10.0.0.27.80 > 10.0.2.202.45676: tcp 0
116 | 20:11:29.782014 eth0 Out ifindex 24 aa:24:9c:63:2e:7d 10.0.2.202.45676 > 10.0.0.27.80: tcp 0
117 | 20:11:29.782297 eth0 Out ifindex 24 aa:24:9c:63:2e:7d 10.0.2.202.45676 > 10.0.0.27.80: tcp 75
118 | ```
119 |
120 | The first TCP packet sent at `20:11:29.780374` already contains the destination IP of one of the backend Pods. This kind of behaviour can very easily [enhance](https://cilium.io/blog/2018/08/07/istio-10-cilium) but also [trip up](https://github.com/linkerd/linkerd2/issues/5932#issuecomment-811747872) applications relying on [traffic interception](https://docs.openservicemesh.io/docs/tasks/traffic_management/iptables_redirection/).
121 |
122 | Now let's take a close look at the "happy path" of the eBPF program responsible for this. The above `curl` command would try to connect to an IPv4 address and would invoke the [`connect()`](https://man7.org/linux/man-pages/man2/connect.2.html) syscall, to which the `connect4` eBPF program is attached ([source](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/bpf_sock.c#L446)).
123 |
124 | {{< highlight c "linenos=false,hl_lines=7 " >}}
125 | __section("connect4")
126 | int sock4_connect(struct bpf_sock_addr *ctx)
127 | {
128 | if (sock_is_health_check(ctx))
129 | return __sock4_health_fwd(ctx);
130 |
131 | __sock4_xlate_fwd(ctx, ctx, false);
132 | return SYS_PROCEED;
133 | }
134 | {{< / highlight >}}
135 |
136 |
137 | Most of the processing is done inside the [`__sock4_xlate_fwd`](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/bpf_sock.c#L328) function; we'll break it down into multiple parts for simplicity and omit some of the less important bits that cover special use cases like `sessionAffinity` and `externalTrafficPolicy`. Note that regardless of what happens in the above function, the returned value is always `SYS_PROCEED`, which returns the control flow back to the kernel.
138 |
139 | The first thing that happens inside this function is the Services map lookup based on the destination IP and port:
140 |
141 | {{< highlight c "linenos=false,hl_lines=8-9 13 " >}}
142 | static __always_inline int __sock4_xlate_fwd(struct bpf_sock_addr *ctx,
143 | struct bpf_sock_addr *ctx_full,
144 | const bool udp_only)
145 | {
146 | struct lb4_backend *backend;
147 | struct lb4_service *svc;
148 | struct lb4_key key = {
149 | .address = ctx->user_ip4,
150 | .dport = ctx_dst_port(ctx),
151 | }, orig_key = key;
152 | struct lb4_service *backend_slot;
153 |
154 | svc = lb4_lookup_service(&key, true);
155 | if (!svc)
156 | svc = sock4_wildcard_lookup_full(&key, in_hostns);
157 | if (!svc)
158 | return -ENXIO;
159 | {{< / highlight >}}
160 |
161 | Kubernetes Services can have an arbitrary number of Endpoints, depending on the number of matching Pods, however eBPF maps have a [fixed size](https://docs.cilium.io/en/latest/concepts/ebpf/maps/#ebpf-maps), so storing variable-sized values is not possible. In order to overcome this, the lookup process is broken into two steps:
162 |
163 | * The first lookup is done just with the destination IP and port and the returned value tells how many Endpoints are currently associated with the Service.
164 | * The second lookup is done with the same destination IP and port _plus_ an additional field called `backend_slot` which corresponds to one of the backend Endpoints.
165 |
166 | During the first lookup `backend_slot` is set to 0. The returned value contains [a number of fields](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/common.h#L767) but the most important one at this stage is `count` -- the total number of Endpoints for this Service.
167 |
168 | {{< highlight c "linenos=false,hl_lines=8-9 15 " >}}
169 | static __always_inline
170 | struct lb4_service *lb4_lookup_service(struct lb4_key *key,
171 | const bool scope_switch)
172 | {
173 | struct lb4_service *svc;
174 |
175 | key->scope = LB_LOOKUP_SCOPE_EXT;
176 | key->backend_slot = 0;
177 | svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
178 | if (svc) {
179 | if (!scope_switch || !lb4_svc_is_local_scope(svc))
180 | return svc->count ? svc : NULL;
181 | key->scope = LB_LOOKUP_SCOPE_INT;
182 | svc = map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
183 | if (svc && svc->count)
184 | return svc;
185 | }
186 |
187 | return NULL;
188 | }
189 | {{< / highlight >}}
190 |
191 | Let's look inside the eBPF map and see which entries match the last two octets of our ClusterIP `10.96.32.28`:
192 |
193 | {{< highlight bash "linenos=false,hl_lines=5 " >}}
194 | $ NODE=k8s-guide-worker2
195 | $ cilium=$(kubectl get -l k8s-app=cilium pods -n cilium --field-selector spec.nodeName=$NODE -o jsonpath='{.items[0].metadata.name}')
196 | $ kubectl -n cilium exec -it $cilium -- bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_lb4_services_v2 | grep "20 1c"
197 | key: 0a 60 20 1c 00 50 03 00 00 00 00 00 value: 0b 00 00 00 00 00 00 07 00 00 00 00
198 | key: 0a 60 20 1c 00 50 00 00 00 00 00 00 value: 00 00 00 00 03 00 00 07 00 00 00 00
199 | key: 0a 60 20 1c 00 50 01 00 00 00 00 00 value: 09 00 00 00 00 00 00 07 00 00 00 00
200 | key: 0a 60 20 1c 00 50 02 00 00 00 00 00 value: 0a 00 00 00 00 00 00 07 00 00 00 00
201 | {{< / highlight >}}
202 |
203 | If the `backend_slot` is set to 0, the key only contains the IP and port of the Service, so the second line matches the first lookup and the [returned value](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/common.h#L767) can be interpreted as (a byte-by-byte decode is sketched right after this list):
204 |
205 | * `backend_id = 0`
206 | * `count = 3`
207 |
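To make sense of these raw bytes by hand, remember that the address and port are stored in network byte order, while the counters are in host (little-endian) byte order. The following is purely illustrative and assumes the `lb4_key`/`lb4_service` field layout linked above:

```bash
# Key = 0a 60 20 1c | 00 50 | 00 00 | ... -> address, dport (network order), backend_slot
$ printf 'address: %d.%d.%d.%d\n' 0x0a 0x60 0x20 0x1c
address: 10.96.32.28
$ printf 'dport: %d\n' 0x0050
dport: 80
# Value = 00 00 00 00 | 03 00 | ... -> backend_id (4 bytes), count (2 bytes, host order)
$ printf 'backend_id: %d  count: %d\n' 0x00000000 0x0003
backend_id: 0  count: 3
```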
208 | Now the eBPF program knows that the total number of Endpoints is 3, but it hasn't picked one yet. The control returns to the `__sock4_xlate_fwd` function, where the `count` information is used to update the lookup `key.backend_slot`:
209 |
210 | {{< highlight c "linenos=false,hl_lines=4 " >}}
211 | if (backend_id == 0) {
212 | backend_from_affinity = false;
213 |
214 | key.backend_slot = (sock_select_slot(ctx_full) % svc->count) + 1;
215 | backend_slot = __lb4_lookup_backend_slot(&key);
216 | if (!backend_slot) {
217 | update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND_SLOT);
218 | return -ENOENT;
219 | }
220 |
221 | backend_id = backend_slot->backend_id;
222 | backend = __lb4_lookup_backend(backend_id);
223 | }
224 | {{< / highlight >}}
225 |
226 | This is where the backend selection takes place either randomly (for TCP) or based on the [socket cookie](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/bpf_sock.c#L101) (for UDP):
227 |
228 | {{< highlight c "linenos=false,hl_lines=5 " >}}
229 | static __always_inline __maybe_unused
230 | __u64 sock_select_slot(struct bpf_sock_addr *ctx)
231 | {
232 | return ctx->protocol == IPPROTO_TCP ?
233 | get_prandom_u32() : sock_local_cookie(ctx);
234 | }
235 | {{< / highlight >}}
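As a rough illustration of the slot selection math: whatever 32-bit value comes out of `get_prandom_u32()` or `sock_local_cookie()`, the modulo operation maps it onto one of the `count` backend slots, numbered starting from 1. A quick shell emulation (illustrative only):

```bash
# Emulate (random % count) + 1 for a Service with 3 backends;
# the result is always a slot number between 1 and 3.
$ RAND=$(od -An -N4 -tu4 /dev/urandom | tr -d ' ')
$ echo $(( RAND % 3 + 1 ))
```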
236 |
237 | The [second lookup](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/lb.h#L1095) is performed in the same map, but now the key contains the previously selected `backend_slot`:
238 |
239 | {{< highlight c "linenos=false,hl_lines=4 " >}}
240 | static __always_inline
241 | struct lb4_service *__lb4_lookup_backend_slot(struct lb4_key *key)
242 | {
243 | return map_lookup_elem(&LB4_SERVICES_MAP_V2, key);
244 | }
245 | {{< / highlight >}}
246 |
247 | The lookup result will contain one of the values from rows 1, 3 or 4, each with a non-zero `backend_id` -- `0b 00`, `09 00` or `0a 00`:
248 |
249 | {{< highlight c "linenos=false,hl_lines=2 4 5 " >}}
250 | $ kubectl -n cilium exec -it $cilium -- bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_lb4_services_v2 | grep "20 1c"
251 | key: 0a 60 20 1c 00 50 03 00 00 00 00 00 value: 0b 00 00 00 00 00 00 07 00 00 00 00
252 | key: 0a 60 20 1c 00 50 00 00 00 00 00 00 value: 00 00 00 00 03 00 00 07 00 00 00 00
253 | key: 0a 60 20 1c 00 50 01 00 00 00 00 00 value: 09 00 00 00 00 00 00 07 00 00 00 00
254 | key: 0a 60 20 1c 00 50 02 00 00 00 00 00 value: 0a 00 00 00 00 00 00 07 00 00 00 00
255 | {{< / highlight >}}
256 |
257 | Using this value, we can now extract the IP and port details of the backend Pod:
258 |
259 |
260 | ```c
261 | static __always_inline struct lb4_backend *__lb4_lookup_backend(__u16 backend_id)
262 | {
263 | return map_lookup_elem(&LB4_BACKEND_MAP, &backend_id);
264 | }
265 | ```
266 |
267 | Let's assume that the `backend_id` that got chosen before was `0a 00` and look up the details in the eBPF map:
268 |
269 | ```
270 | $ kubectl -n cilium exec -it $cilium -- bpftool map lookup pinned /sys/fs/bpf/tc/globals/cilium_lb4_backends key 0x0a 0x00
271 | key: 0a 00 value: 0a 00 00 1b 00 50 00 00
272 | ```
273 |
274 | The [returned value](https://github.com/cilium/cilium/blob/4145278ccc6e90739aa100c9ea8990a0f561ca95/bpf/lib/common.h#L782) can be interpreted as:
275 |
276 | * **Address** = `10.0.0.27`
277 | * **Port** = `80`
278 |
279 | Finally, the eBPF program does the socket-based NAT translation, i.e. rewriting the destination IP and port with the values returned by the earlier lookup:
280 |
281 | {{< highlight c "linenos=false,hl_lines=1 2 " >}}
282 |
283 | ctx->user_ip4 = backend->address;
284 | ctx_set_port(ctx, backend->port);
285 |
286 | return 0;
287 | {{< / highlight >}}
288 |
289 |
290 | At this stage, the eBPF program returns and the execution flow continues inside the Linux kernel networking stack all the way until the packet is built and sent out of the egress interface. From there, the packet continues along the path built by the [CNI portion](/cni/cilium) of Cilium.
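The same Service-to-backend translations that we've just pieced together from raw map dumps can also be viewed in a friendlier format with the `cilium` CLI inside the agent Pod, which makes for a handy sanity check:

```bash
# Human-readable view of the datapath load-balancing state; our ClusterIP
# 10.96.32.28:80 should be listed together with its three backend Pods.
$ kubectl -n cilium exec -it $cilium -- cilium service list
```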
291 |
292 | This is all that's required to replace the biggest part of `kube-proxy`'s functionality. One big difference from the `kube-proxy` implementation is that the NAT translation only happens for traffic originating on one of the Kubernetes nodes, e.g. [externally originated](https://docs.projectcalico.org/networking/advertise-service-ips) ClusterIP traffic is not currently supported. This is why we haven't considered the **Any-to-Service** communication use case, as we did for IPTables and IPVS.
293 |
294 |
295 | {{% notice info %}}
296 | Due to a [known issue](https://docs.cilium.io/en/v1.9/gettingstarted/kind/#unable-to-contact-k8s-api-server) with kind, make sure to run `make cilium-unhook` when you're finished with this Cilium lab to detach eBPF programs from the host cgroup.
297 | {{% /notice %}}
298 |
299 |
300 | ### Additional reading
301 |
302 | [Cilium socket LB presentation](https://docs.google.com/presentation/d/1w2zlpGWV7JUhHYd37El_AUZzyUNSvDfktrF5MJ5G8Bs/edit#slide=id.g746fc02b5b_2_0)
303 |
304 | [Kubernetes Without kube-proxy](https://docs.cilium.io/en/latest/gettingstarted/kubeproxy-free/)
305 |
306 |
--------------------------------------------------------------------------------
/content/services/clusterIP/dataplane/iptables.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "IPTABLES"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 10
5 | ---
6 |
7 | Most of the focus of this section will be on the standard node-local proxy implementation called [`kube-proxy`](https://kubernetes.io/docs/concepts/overview/components/#kube-proxy). It is used by default by most Kubernetes orchestrators and is installed as a daemonset on top of a newly bootstrapped cluster:
8 |
9 |
10 | ```
11 | $ kubectl get daemonset -n kube-system -l k8s-app=kube-proxy
12 | NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
13 | kube-proxy 3 3 3 3 3 kubernetes.io/os=linux 2d16h
14 | ```
15 |
16 | The default mode of operation for `kube-proxy` is `iptables`, as it provides support for a wider set of operating systems without requiring extra kernel modules and has "good enough" performance characteristics for the majority of small to medium-sized clusters.
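A quick way to confirm which mode a cluster is actually running is to look at the kube-proxy configuration. The sketch below assumes a kubeadm-based cluster (which includes kind), where the configuration lives in a ConfigMap; an empty `mode` field means the platform default is in effect, which on Linux is `iptables`:

```bash
# ConfigMap name and the "mode" field are kubeadm defaults; adjust for other distributions.
$ kubectl -n kube-system get configmap kube-proxy -o yaml | grep "mode:"
```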
17 |
18 | This area of Kubernetes networking is one of the most poorly documented. On the one hand, there are [blogposts](https://medium.com/google-cloud/understanding-kubernetes-networking-services-f0cb48e4cc82) that cover parts of the `kube-proxy` dataplane, on the other hand there's an amazing [diagram](https://docs.google.com/drawings/d/1MtWL8qRTs6PlnJrW4dh8135_S9e2SaawT410bJuoBPk/edit) created by [Tim Hockin](https://twitter.com/thockin) that shows a complete logical flow of packet forwarding decisions but provides very little context and is quite difficult to trace for specific flows. The goal of this article is to bridge the gap between these two extremes and provide a high level of detail while maintaining an easily consumable format.
19 |
20 | For demonstration purposes, we'll use the following topology with a "web" deployment and two Pods scheduled on different worker nodes. The packet forwarding logic for ClusterIP-type Services has two distinct paths within the dataplane, which is what we'll focus on next:
21 |
22 | 1. **Pod-to-Service** communication (purple packets) -- implemented entirely within an egress node and relies on CNI for pod-to-pod reachability.
23 | 2. **Any-to-Service** communication (grey packets) -- includes any externally-originated and, most notably, node-to-service traffic flows.
24 |
25 | {{< iframe "https://viewer.diagrams.net/?highlight=0000ff&edit=_blank&hide-pages=1&editable=false&layers=1&nav=0&page-id=nEL34B1qbs_s_G34E68V&title=k8s-guide.drawio#Uhttps%3A%2F%2Fraw.githubusercontent.com%2Fnetworkop%2Fk8s-guide-labs%2Fmaster%2Fdiagrams%2Fk8s-guide.drawio" >}}
26 |
27 |
28 | The above diagram shows a slightly simplified sequence of match/set actions implemented inside Netfilter's NAT table. The lab section below will show a more detailed view of this dataplane along with the corresponding verification commands.
29 |
30 | {{% notice note %}}
31 | One key thing to remember is that none of the ClusterIPs implemented this way are visible in the Linux routing table. The whole dataplane is implemented entirely within the iptables NAT table, which makes it both very flexible and extremely difficult to troubleshoot at the same time.
32 | {{% /notice %}}
33 |
34 | ### Lab Setup
35 |
36 | To make sure the lab is in the right state, reset it to a blank state:
37 |
38 | ```bash
39 | make up && make reset
40 | ```
41 |
42 | Now let's spin up a new deployment and expose it with a ClusterIP service:
43 |
44 | ```bash
45 | $ kubectl create deploy web --image=nginx --replicas=2
46 | $ kubectl expose deploy web --port 80
47 | ```
48 |
49 | The result of the above two commands can be verified like this:
50 |
51 | ```bash
52 | $ kubectl get deploy web
53 | NAME READY UP-TO-DATE AVAILABLE AGE
54 | web 2/2 2 2 160m
55 | $ kubectl get svc web
56 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
57 | web    ClusterIP   10.96.94.225   <none>        80/TCP    31s
58 | ```
59 |
60 | The simplest way to test connectivity would be to connect to the assigned ClusterIP `10.96.94.225` from one of the nodes, e.g.:
61 |
62 | ```bash
63 | $ docker exec k8s-guide-worker curl -s 10.96.94.225 | grep Welcome
64 | <title>Welcome to nginx!</title>
65 | <h1>Welcome to nginx!</h1>
66 | ```
67 |
68 | One last thing before moving on: let's set up the following bash alias as a shortcut to `k8s-guide-worker`'s iptables NAT table:
69 |
70 | ```bash
71 | $ alias d="docker exec k8s-guide-worker iptables -t nat -nvL"
72 | ```
73 |
74 | ### Use case #1: Pod-to-Service communication
75 |
76 | According to Tim's [diagram](https://docs.google.com/drawings/d/1MtWL8qRTs6PlnJrW4dh8135_S9e2SaawT410bJuoBPk/edit) all Pod-to-Service packets get intercepted by the `PREROUTING` chain:
77 |
78 | {{< highlight bash "linenos=false,hl_lines=4" >}}
79 | $ d PREROUTING
80 | Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
81 | pkts bytes target prot opt in out source destination
82 | 313 18736 KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
83 | 36 2242 DOCKER_OUTPUT all -- * * 0.0.0.0/0 172.16.0.190
84 | {{< / highlight >}}
85 |
86 | These packets get redirected to the `KUBE-SERVICES` chain, where they get matched against _all_ configured ClusterIPs, eventually reaching these lines:
87 |
88 | {{< highlight bash "linenos=false,hl_lines=3" >}}
89 | $ d KUBE-SERVICES | grep 10.96.94.225
90 | 3 180 KUBE-MARK-MASQ tcp -- * * !10.244.0.0/16 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
91 | 3 180 KUBE-SVC-LOLE4ISW44XBNF3G tcp -- * * 0.0.0.0/0 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
92 | {{< / highlight >}}
93 |
94 | Since the source IP of the packet belongs to a Pod (`10.244.0.0/16` is the PodCIDR range), the second line gets matched and the lookup continues in the service-specific chain. Here we have two Pods matching the same label selector (`--replicas=2`), and both endpoint chains are configured for equal traffic distribution:
95 |
96 | {{< highlight bash "linenos=false,hl_lines=4 12" >}}
97 | $ d KUBE-SVC-LOLE4ISW44XBNF3G
98 | Chain KUBE-SVC-LOLE4ISW44XBNF3G (1 references)
99 | pkts bytes target prot opt in out source destination
100 | 0 0 KUBE-SEP-MHDQ23KUGG7EGFMW all -- * * 0.0.0.0/0 0.0.0.0/0 /* default/web */ statistic mode random probability 0.50000000000
101 | 0 0 KUBE-SEP-ZA2JI7K7LSQNKDOS all -- * * 0.0.0.0/0 0.0.0.0/0 /* default/web */
102 | {{< / highlight >}}
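The 0.5 probability on the first rule is not arbitrary: with `N` Endpoints, kube-proxy gives rule `i` a probability of `1/(N-i+1)` and leaves the last rule unconditional, so each Endpoint ends up receiving an equal share of new connections. A small, illustrative sketch of that arithmetic:

```bash
# Illustrative only: per-rule probabilities for N endpoints; with N=2 this yields
# 0.5 followed by an unconditional catch-all, matching the chain shown above.
$ N=2; for i in $(seq 1 $N); do awk -v n=$N -v i=$i 'BEGIN { printf "rule %d: probability %.3f\n", i, 1/(n-i+1) }'; done
rule 1: probability 0.500
rule 2: probability 1.000
```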
103 |
104 | Let's assume that in this case the first rule gets matched, so our packet continues on to the next chain where it gets DNAT'ed to the target IP of the destination Pod (`10.244.1.3`):
105 |
106 | {{< highlight bash "linenos=false,hl_lines=5" >}}
107 | $ d KUBE-SEP-MHDQ23KUGG7EGFMW
108 | Chain KUBE-SEP-MHDQ23KUGG7EGFMW (1 references)
109 | pkts bytes target prot opt in out source destination
110 | 0 0 KUBE-MARK-MASQ all -- * * 10.244.1.3 0.0.0.0/0 /* default/web */
111 | 3 180 DNAT tcp -- * * 0.0.0.0/0 0.0.0.0/0 /* default/web */ tcp to:10.244.1.3:80
112 | {{< / highlight >}}
113 |
114 | From here on our packet remains unmodified and continues along its forwarding path set up by a [CNI plugin](/cni/kindnet/) until it reaches the target Node and gets sent directly to the destination Pod.
115 |
116 |
117 |
118 | ### Use case #2: Any-to-Service communication
119 |
120 | Let's assume that the `k8s-guide-worker` node (IP `172.18.0.12`) is sending a packet to our ClusterIP service. This packet gets intercepted in the `OUTPUT` chain and continues to the `KUBE-SERVICES` chain, where it first gets redirected via the `KUBE-MARK-MASQ` chain:
121 |
122 | {{< highlight bash "linenos=false,hl_lines=4 8" >}}
123 | $ d OUTPUT
124 | Chain OUTPUT (policy ACCEPT 224 packets, 13440 bytes)
125 | pkts bytes target prot opt in out source destination
126 | 4540 272K KUBE-SERVICES all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
127 | 42 2661 DOCKER_OUTPUT all -- * * 0.0.0.0/0 172.16.0.190
128 |
129 | $ d KUBE-SERVICES | grep 10.96.94.225
130 | 3 180 KUBE-MARK-MASQ tcp -- * * !10.244.0.0/16 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
131 | 3 180 KUBE-SVC-LOLE4ISW44XBNF3G tcp -- * * 0.0.0.0/0 10.96.94.225 /* default/web cluster IP */ tcp dpt:80
132 | {{< / highlight >}}
133 |
134 | The purpose of this chain is to mark all packets that will need to get SNAT'ed before they get sent to the final destination:
135 |
136 | {{< highlight bash "linenos=false,hl_lines=4" >}}
137 | $ d KUBE-MARK-MASQ
138 | Chain KUBE-MARK-MASQ (19 references)
139 | pkts bytes target prot opt in out source destination
140 | 3 180 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK or 0x4000
141 | {{< / highlight >}}
142 |
143 | Since `MARK` is not a [terminating target](https://gist.github.com/mcastelino/c38e71eb0809d1427a6650d843c42ac2#targets), the lookup continues down the `KUBE-SERVICES` chain, where our packet gets DNAT'ed to one of the randomly selected backend Endpoints (as shown above).
144 |
145 | However, this time, before it gets sent to its final destination, the packet gets another detour via the `KUBE-POSTROUTING` chain:
146 |
147 |
148 | {{< highlight bash "linenos=false,hl_lines=4" >}}
149 | $ d POSTROUTING
150 | Chain POSTROUTING (policy ACCEPT 140 packets, 9413 bytes)
151 | pkts bytes target prot opt in out source destination
152 | 715 47663 KUBE-POSTROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
153 | 0 0 DOCKER_POSTROUTING all -- * * 0.0.0.0/0 172.16.0.190
154 | 657 44150 KIND-MASQ-AGENT all -- * * 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type !LOCAL /* kind-masq-agent: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom KIND-MASQ-AGENT chain */
155 | {{< / highlight >}}
156 |
157 | Here all packets with a special SNAT mark (0x4000) fall through to the last rule and get SNAT'ed to the IP of the outgoing interface, which in this case is the veth interface connected to the Pod:
158 |
159 | {{< highlight bash "linenos=false,hl_lines=6" >}}
160 | $ d KUBE-POSTROUTING
161 | Chain KUBE-POSTROUTING (1 references)
162 | pkts bytes target prot opt in out source destination
163 | 463 31166 RETURN all -- * * 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
164 | 2 120 MARK all -- * * 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
165 | 2 120 MASQUERADE all -- * * 0.0.0.0/0 0.0.0.0/0 /* kubernetes service traffic requiring SNAT */ random-fully
166 | {{< / highlight >}}
167 |
168 |
169 | The final `MASQUERADE` action ensures that return packets follow the same path back, even if they originated outside of the Kubernetes cluster.
170 |
171 | {{% notice info %}}
172 | The above sequence of lookups may look long and inefficient, but bear in mind that it is only performed once, for the first packet of a flow; the remainder of the session gets offloaded to Netfilter's connection tracking system.
173 | {{% /notice %}}
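You can see this offloading in action by inspecting the conntrack table on the node: the entry for our test flow records both the original ClusterIP destination and the translated Pod address. The exact output format depends on the conntrack-tools version installed on the node.

```bash
# Show tracked connections whose original destination was the ClusterIP;
# the conntrack binary is present on kind nodes since kubeadm depends on it.
$ docker exec k8s-guide-worker conntrack -L -d 10.96.94.225
```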
174 |
175 |
176 |
177 | ### Additional Reading
178 |
179 | * [**Netfilter Packet flow** ](https://upload.wikimedia.org/wikipedia/commons/3/37/Netfilter-packet-flow.svg)
180 | * [**Logical diagram of kube-proxy in iptables mode**](https://docs.google.com/drawings/d/1MtWL8qRTs6PlnJrW4dh8135_S9e2SaawT410bJuoBPk/edit)
181 | * [**Alternative kube-proxy implementations**](https://arthurchiao.art/blog/cracking-k8s-node-proxy/)
182 | * [**Kubernetes networking demystified**](https://www.cncf.io/blog/2020/01/30/kubernetes-networking-demystified-a-brief-guide/)
--------------------------------------------------------------------------------
/content/services/mesh/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Service Meshes"
3 | date: 2020-09-13T17:33:04+01:00
4 | weight: 80
5 | ---
6 |
7 | # Under construction [help needed]
--------------------------------------------------------------------------------
/layouts/partials/favicon.html:
--------------------------------------------------------------------------------
1 |
2 | {{ template "_internal/twitter_cards.html" . }}
--------------------------------------------------------------------------------
/layouts/partials/logo.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/layouts/partials/menu-footer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Star
5 |
6 |
7 | Fork
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/layouts/shortcodes/div.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/networkop/tkng/a3667e05c00fff40a82adcfd6178ca6fd9db3618/layouts/shortcodes/div.html
--------------------------------------------------------------------------------
/layouts/shortcodes/iframe.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/license.md:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/netlify.toml:
--------------------------------------------------------------------------------
1 |
2 | [build]
3 | publish = "public"
4 | command = "hugo --gc --minify"
5 |
6 | [context.production.environment]
7 | HUGO_VERSION = "0.74.3"
8 | HUGO_ENV = "production"
9 | HUGO_ENABLEGITINFO = "true"
10 |
11 | [context.split1]
12 | command = "hugo --gc --minify --enableGitInfo"
13 |
14 | [context.split1.environment]
15 | HUGO_VERSION = "0.74.3"
16 | HUGO_ENV = "production"
17 |
18 | [context.deploy-preview]
19 | command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL"
20 |
21 | [context.deploy-preview.environment]
22 | HUGO_VERSION = "0.74.3"
23 |
24 | [context.branch-deploy]
25 | command = "hugo --gc --minify -b $DEPLOY_PRIME_URL"
26 |
27 | [context.branch-deploy.environment]
28 | HUGO_VERSION = "0.74.3"
29 |
30 | [context.next.environment]
31 | HUGO_ENABLEGITINFO = "true"
32 |
33 | [[redirects]]
34 | from = "https://k8s.networkop.co.uk/*"
35 | to = "https://www.tkng.io/:splat"
36 | status = 301
37 | force = true
--------------------------------------------------------------------------------
/static/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/networkop/tkng/a3667e05c00fff40a82adcfd6178ca6fd9db3618/static/images/favicon.png
--------------------------------------------------------------------------------
/static/images/k8s-guide-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/networkop/tkng/a3667e05c00fff40a82adcfd6178ca6fd9db3618/static/images/k8s-guide-logo.png
--------------------------------------------------------------------------------