├── .gitignore ├── LICENSE ├── README.md ├── SUMMARY.md ├── chapter1 ├── README.md ├── section1.1.md └── section1.2.md ├── chapter2 ├── README.md ├── section2.1.md └── section2.2.md ├── chapter3 ├── README.md ├── section3.1.md ├── section3.2.md └── section3.3.md ├── chapter4 ├── README.md ├── section4.1.md ├── section4.2.md ├── section4.3.md └── section4.4.md └── chapter5 ├── README.md ├── section5.1.md ├── section5.2.md └── section5.3.md /.gitignore: -------------------------------------------------------------------------------- 1 | _book/* 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # studying-containerd-notes 2 | Brief notes on lessons learned while studying containerd 1.2. 3 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [Introduction](README.md) 4 | * [Getting Started](chapter1/README.md) 5 | * [Overview](chapter1/section1.1.md) 6 | * [Startup Flow](chapter1/section1.2.md) 7 | * [API Layer](chapter2/README.md) 8 | * [Containers API](chapter2/section2.1.md) 9 | * [Task API](chapter2/section2.2.md) 10 | * [Execution Engine](chapter3/README.md) 11 | * [ctr run Flow](chapter3/section3.1.md) 12 | * [Container I/O](chapter3/section3.2.md) 13 | * [runc](chapter3/section3.3.md) 14 | * [Images](chapter4/README.md) 15 | * [image fetch](chapter4/section4.1.md) 16 | * [image unpack](chapter4/section4.2.md) 17 | * [snapshotter](chapter4/section4.3.md) 18 | * [pouch commit Implementation](chapter4/section4.4.md) 19 | * [Storage](chapter5/README.md) 20 | * [native](chapter5/section5.1.md) 21 | * [overlayfs](chapter5/section5.2.md) 22 | * [btrfs](chapter5/section5.3.md) 23 | 24 | -------------------------------------------------------------------------------- /chapter1/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | This chapter introduces the overall architecture of containerd and its startup flow. 3 | -------------------------------------------------------------------------------- /chapter1/section1.1.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Containerd is an industry-standard container runtime that emphasizes simplicity, robustness, and portability. It can manage the complete container lifecycle on a host: image transfer and storage, container execution and supervision, storage, networking, and so on. 4 | 5 | ![image.png-16.5kB][1] 6 | The figure above shows where containerd sits in the overall Docker Engine architecture. containerd is a fairly thin layer acting as an abstract container runtime, while containerd-shim is the parent process of the actual container process, monitoring and reaping it so that the container remains controllable. 7 | 8 | The figure below shows containerd's components: low-level storage is handled by boltdb (implemented in Go), containers live inside containerd as metadata, containers are created and destroyed through the Task interface, and the underlying runtime is provided by OCI-compliant runtimes such as runc/runv. 9 | ![image.png-105.9kB][2] 10 | 11 | Online gRPC API documentation: 12 | https://dnephin.github.io/containerd/ 13 | 14 | 15 | [1]: http://static.zybuluo.com/myecho/frkxdgjsk7f4xwjboadifk4n/image.png 16 | [2]: http://static.zybuluo.com/myecho/i7csubxibdandpywp1wl0jew/image.png 17 | -------------------------------------------------------------------------------- /chapter1/section1.2.md: -------------------------------------------------------------------------------- 1 | # Startup Flow 2 | 3 | Entry path: containerd/cmd/containerd/command/main.go 4 | 5 | app.Action = func(context *cli.Context) error { 6 | var ( 7 | start = time.Now() 8 | signals = make(chan os.Signal, 2048) 9 | serverC = make(chan *server.Server, 1) 10 | ctx = gocontext.Background() 11 | config = defaultConfig() 12 | // the default config root is /var/lib/containerd 13 | ) 14 | 15 | done := handleSignals(ctx, signals, serverC) 16 | // start the signal handler as soon as we can to make sure that 17 | // we don't miss any signals during boot 18 | signal.Notify(signals, handledSignals...)
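        // What follows below: load the TOML config passed via --config, apply the CLI
        // flags on top of it, clean up temp mounts left over from a previous run, then
        // build the server and expose the debug, metrics and main gRPC endpoints.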
19 | 20 | if err := srvconfig.LoadConfig(context.GlobalString("config"), config); err != nil && !os.IsNotExist(err) { 21 | return err 22 | } 23 | // apply flags to the config 24 | if err := applyFlags(context, config); err != nil { 25 | return err 26 | } 27 | // cleanup temp mounts 28 | if err := mount.SetTempMountLocation(filepath.Join(config.Root, "tmpmounts")); err != nil { 29 | return errors.Wrap(err, "creating temp mount location") 30 | } 31 | // unmount all temp mounts on boot for the server 32 | warnings, err := mount.CleanupTempMounts(0) 33 | if err != nil { 34 | log.G(ctx).WithError(err).Error("unmounting temp mounts") 35 | } 36 | for _, w := range warnings { 37 | log.G(ctx).WithError(w).Warn("cleanup temp mount") 38 | } 39 | address := config.GRPC.Address 40 | if address == "" { 41 | return errors.New("grpc address cannot be empty") 42 | } 43 | log.G(ctx).WithFields(logrus.Fields{ 44 | "version": version.Version, 45 | "revision": version.Revision, 46 | }).Info("starting containerd") 47 | 48 | server, err := server.New(ctx, config) 49 | if err != nil { 50 | return err 51 | } 52 | serverC <- server 53 | if config.Debug.Address != "" { 54 | var l net.Listener 55 | if filepath.IsAbs(config.Debug.Address) { 56 | if l, err = sys.GetLocalListener(config.Debug.Address, config.Debug.UID, config.Debug.GID); err != nil { 57 | return errors.Wrapf(err, "failed to get listener for debug endpoint") 58 | } 59 | } else { 60 | if l, err = net.Listen("tcp", config.Debug.Address); err != nil { 61 | return errors.Wrapf(err, "failed to get listener for debug endpoint") 62 | } 63 | } 64 | //初始化debug的接口 65 | serve(ctx, l, server.ServeDebug) 66 | } 67 | if config.Metrics.Address != "" { 68 | l, err := net.Listen("tcp", config.Metrics.Address) 69 | if err != nil { 70 | return errors.Wrapf(err, "failed to get listener for metrics endpoint") 71 | } 72 | //初始化ServeMetrics的接口 73 | serve(ctx, l, server.ServeMetrics) 74 | } 75 | 76 | //产生unix-socket被grpcServer使用来监听, 默认在/run/containerd/containerd.sock 77 | l, err := sys.GetLocalListener(address, config.GRPC.UID, config.GRPC.GID) 78 | if err != nil { 79 | return errors.Wrapf(err, "failed to get listener for main endpoint") 80 | } 81 | //rpc在Server.New中被初始化 82 | serve(ctx, l, server.ServeGRPC) 83 | 84 | log.G(ctx).Infof("containerd successfully booted in %fs", time.Since(start).Seconds()) 85 | <-done 86 | return nil 87 | } 88 | return app 89 | } 90 | 91 | 92 | 93 | // New creates and initializes a new containerd server 94 | func New(ctx context.Context, config *srvconfig.Config) (*Server, error) { 95 | switch { 96 | case config.Root == "": 97 | return nil, errors.New("root must be specified") 98 | case config.State == "": 99 | return nil, errors.New("state must be specified") 100 | case config.Root == config.State: 101 | return nil, errors.New("root and state must be different paths") 102 | } 103 | 104 | if err := os.MkdirAll(config.Root, 0711); err != nil { 105 | return nil, err 106 | } 107 | if err := os.MkdirAll(config.State, 0711); err != nil { 108 | return nil, err 109 | } 110 | if err := apply(ctx, config); err != nil { 111 | return nil, err 112 | } 113 | //加载插件,比如snapshottor, metaStore,content等,但是其中还有个proxyPlugin,看起来是为某些服务提供的代理 114 | plugins, err := LoadPlugins(ctx, config) 115 | if err != nil { 116 | return nil, err 117 | } 118 | 119 | serverOpts := []grpc.ServerOption{ 120 | grpc.UnaryInterceptor(grpc_prometheus.UnaryServerInterceptor), 121 | grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor), 122 | } 123 | if config.GRPC.MaxRecvMsgSize > 0 { 
124 | serverOpts = append(serverOpts, grpc.MaxRecvMsgSize(config.GRPC.MaxRecvMsgSize)) 125 | } 126 | if config.GRPC.MaxSendMsgSize > 0 { 127 | serverOpts = append(serverOpts, grpc.MaxSendMsgSize(config.GRPC.MaxSendMsgSize)) 128 | } 129 | rpc := grpc.NewServer(serverOpts...) 130 | var ( 131 | services []plugin.Service 132 | s = &Server{ 133 | rpc: rpc, 134 | events: exchange.NewExchange(), 135 | config: config, 136 | } 137 | initialized = plugin.NewPluginSet() 138 | ) 139 | //上边加载好了plugin后,由下边调用init方法初始化每个plugin,并将需要注册的grpc路由收集到services []plugin.Service中 140 | for _, p := range plugins { 141 | id := p.URI() 142 | log.G(ctx).WithField("type", p.Type).Infof("loading plugin %q...", id) 143 | 144 | initContext := plugin.NewContext( 145 | ctx, 146 | p, 147 | initialized, 148 | config.Root, 149 | config.State, 150 | ) 151 | initContext.Events = s.events 152 | initContext.Address = config.GRPC.Address 153 | 154 | // load the plugin specific configuration if it is provided 155 | if p.Config != nil { 156 | pluginConfig, err := config.Decode(p.ID, p.Config) 157 | if err != nil { 158 | return nil, err 159 | } 160 | initContext.Config = pluginConfig 161 | } 162 | result := p.Init(initContext) 163 | if err := initialized.Add(result); err != nil { 164 | return nil, errors.Wrapf(err, "could not add plugin result to plugin set") 165 | } 166 | 167 | instance, err := result.Instance() 168 | if err != nil { 169 | if plugin.IsSkipPlugin(err) { 170 | log.G(ctx).WithField("type", p.Type).Infof("skip loading plugin %q...", id) 171 | } else { 172 | log.G(ctx).WithError(err).Warnf("failed to load plugin %s", id) 173 | } 174 | continue 175 | } 176 | // check for grpc services that should be registered with the server 177 | if service, ok := instance.(plugin.Service); ok { 178 | services = append(services, service) 179 | } 180 | s.plugins = append(s.plugins, result) 181 | } 182 | // register services after all plugins have been initialized(注册rpc路由信息) 183 | for _, service := range services { 184 | if err := service.Register(rpc); err != nil { 185 | return nil, err 186 | } 187 | } 188 | return s, nil 189 | } 190 | 191 | 下面简介一下plugin的类型 192 | 193 | const ( 194 | // InternalPlugin implements an internal plugin to containerd 195 | InternalPlugin Type = "io.containerd.internal.v1" 196 | // RuntimePlugin implements a runtime 197 | RuntimePlugin Type = "io.containerd.runtime.v1" 198 | // RuntimePluginV2 implements a runtime v2 199 | RuntimePluginV2 Type = "io.containerd.runtime.v2" 200 | // ServicePlugin implements a internal service 201 | ServicePlugin Type = "io.containerd.service.v1" 202 | // GRPCPlugin implements a grpc service 203 | GRPCPlugin Type = "io.containerd.grpc.v1" 204 | // SnapshotPlugin implements a snapshotter 205 | SnapshotPlugin Type = "io.containerd.snapshotter.v1" 206 | // TaskMonitorPlugin implements a task monitor 207 | TaskMonitorPlugin Type = "io.containerd.monitor.v1" 208 | // DiffPlugin implements a differ 209 | DiffPlugin Type = "io.containerd.differ.v1" 210 | // MetadataPlugin implements a metadata store 211 | MetadataPlugin Type = "io.containerd.metadata.v1" 212 | // ContentPlugin implements a content store 213 | ContentPlugin Type = "io.containerd.content.v1" 214 | // GCPlugin implements garbage collection policy 215 | GCPlugin Type = "io.containerd.gc.v1" 216 | ) 217 | 218 | func init() { 219 | plugin.Register(&plugin.Registration{ 220 | ID: "btrfs", 221 | Type: plugin.SnapshotPlugin, 222 | Init: func(ic *plugin.InitContext) (interface{}, error) { 223 | return NewSnapshotter(ic.Root) 224 | }, 
225 | }) 226 | } 227 | 228 | func init() { 229 | plugin.Register(&plugin.Registration{ 230 | Type: plugin.SnapshotPlugin, 231 | ID: "overlayfs", 232 | Init: func(ic *plugin.InitContext) (interface{}, error) { 233 | return NewSnapshotter(ic.Root) 234 | }, 235 | }) 236 | } 237 | //可以看到containerd的snapshottor机制也是通过底层的这些storage driver实现的。 238 | 239 | 更详细的plugin介绍在后边涉及到的时候再深入研究。 240 | 241 | ### 总结-流程图 242 | ![应用添加模块.png-12.2kB][1] 243 | 244 | 245 | 246 | [1]: http://static.zybuluo.com/myecho/tz2m8o9i6ppbp8kvrj1r56q7/%E5%BA%94%E7%94%A8%E6%B7%BB%E5%8A%A0%E6%A8%A1%E5%9D%97.png 247 | -------------------------------------------------------------------------------- /chapter2/README.md: -------------------------------------------------------------------------------- 1 | # API层 2 | 3 | -------------------------------------------------------------------------------- /chapter2/section2.1.md: -------------------------------------------------------------------------------- 1 | # Containers API 2 | 3 | 4 | 入口path: containerd/api/services,其中的每个service中都定义各自负责模块的rpc接口。 5 | 6 | ### List 7 | 以containerd list为例子 8 | 首先是函数入口: 9 | 10 | func _Containers_List_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 11 | in := new(ListContainersRequest) 12 | if err := dec(in); err != nil { 13 | return nil, err 14 | } 15 | if interceptor == nil { 16 | return srv.(ContainersServer).List(ctx, in) 17 | } 18 | info := &grpc.UnaryServerInfo{ 19 | Server: srv, 20 | FullMethod: "/containerd.services.containers.v1.Containers/List", 21 | } 22 | //这个handler就是API具体的处理逻辑 23 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 24 | return srv.(ContainersServer).List(ctx, req.(*ListContainersRequest)) 25 | } 26 | return interceptor(ctx, in, info, handler) 27 | } 28 | 29 | 对应接口的实现都在path: containerd/services,对应service的plugin的path在: containerd/services/containers/service.go, 通过对应Service的client为上层提供调用接口,上文中提到的List接口,在path: containerd/services/containers/local.go中, 30 | 31 | func (l *local) List(ctx context.Context, req *api.ListContainersRequest, _ ...grpc.CallOption) (*api.ListContainersResponse, error) { 32 | var resp api.ListContainersResponse 33 | return &resp, errdefs.ToGRPC(l.withStoreView(ctx, func(ctx context.Context, store containers.Store) error { 34 | containers, err := store.List(ctx, req.Filters...) 35 | if err != nil { 36 | return err 37 | } 38 | resp.Containers = containersToProto(containers) 39 | return nil 40 | })) 41 | } 42 | 43 | 根据 withStore 函数可以得到 store 为 metadata.NewContainerStore,路径 containerd/metadata/containers.go 中,containerStore 结构体是包裹的是操作数据库。 44 | 45 | func (s *containerStore) List(ctx context.Context, fs ...string) ([]containers.Container, error) { 46 | namespace, err := namespaces.NamespaceRequired(ctx) 47 | if err != nil { 48 | return nil, err 49 | } 50 | 51 | filter, err := filters.ParseAll(fs...) 
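        // fs holds the filter expressions from the request (e.g. something like
        // "labels.foo==bar"); an empty list matches every container in the namespace.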
52 | if err != nil { 53 | return nil, errors.Wrapf(errdefs.ErrInvalidArgument, err.Error()) 54 | } 55 | 56 | bkt := getContainersBucket(s.tx, namespace) 57 | if bkt == nil { 58 | return nil, nil // empty store 59 | } 60 | 61 | var m []containers.Container 62 | if err := bkt.ForEach(func(k, v []byte) error { 63 | cbkt := bkt.Bucket(k) 64 | if cbkt == nil { 65 | return nil 66 | } 67 | container := containers.Container{ID: string(k)} 68 | //从boltdb的存储格式转化为Container结构 69 | if err := readContainer(&container, cbkt); err != nil { 70 | return errors.Wrapf(err, "failed to read container %q", string(k)) 71 | } 72 | 73 | if filter.Match(adaptContainer(container)) { 74 | m = append(m, container) 75 | } 76 | return nil 77 | }); err != nil { 78 | return nil, err 79 | } 80 | 81 | return m, nil 82 | } 83 | 84 | 85 | func getContainersBucket(tx *bolt.Tx, namespace string) *bolt.Bucket { 86 | return getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectContainers) 87 | } 88 | 89 | //是个嵌套的bucket结构,从最后的name为:bucketKeyObjectContainers的Bucket中取出value 90 | func getBucket(tx *bolt.Tx, keys ...[]byte) *bolt.Bucket { 91 | bkt := tx.Bucket(keys[0]) 92 | 93 | for _, key := range keys[1:] { 94 | if bkt == nil { 95 | break 96 | } 97 | bkt = bkt.Bucket(key) 98 | } 99 | return bkt 100 | } 101 | 102 | ### Create 103 | 入口 104 | 105 | func _Containers_Create_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 106 | in := new(CreateContainerRequest) 107 | if err := dec(in); err != nil { 108 | return nil, err 109 | } 110 | if interceptor == nil { 111 | return srv.(ContainersServer).Create(ctx, in) 112 | } 113 | info := &grpc.UnaryServerInfo{ 114 | Server: srv, 115 | FullMethod: "/containerd.services.containers.v1.Containers/Create", 116 | } 117 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 118 | return srv.(ContainersServer).Create(ctx, req.(*CreateContainerRequest)) 119 | } 120 | return interceptor(ctx, in, info, handler) 121 | } 122 | 123 | 执行逻辑同样位于在path: containerd/services/containers/local.go中 124 | 125 | func (l *local) Create(ctx context.Context, req *api.CreateContainerRequest, _ ...grpc.CallOption) (*api.CreateContainerResponse, error) { 126 | var resp api.CreateContainerResponse 127 | 128 | if err := l.withStoreUpdate(ctx, func(ctx context.Context, store containers.Store) error { 129 | container := containerFromProto(&req.Container) 130 | 131 | created, err := store.Create(ctx, container) 132 | if err != nil { 133 | return err 134 | } 135 | 136 | resp.Container = containerToProto(&created) 137 | 138 | return nil 139 | }); err != nil { 140 | return &resp, errdefs.ToGRPC(err) 141 | } 142 | // 发出事件Event 143 | if err := l.publisher.Publish(ctx, "/containers/create", &eventstypes.ContainerCreate{ 144 | ID: resp.Container.ID, 145 | Image: resp.Container.Image, 146 | Runtime: &eventstypes.ContainerCreate_Runtime{ 147 | Name: resp.Container.Runtime.Name, 148 | Options: resp.Container.Runtime.Options, 149 | }, 150 | }); err != nil { 151 | return &resp, err 152 | } 153 | 154 | return &resp, nil 155 | } 156 | 157 | //最后落到存储层 158 | func (s *containerStore) Create(ctx context.Context, container containers.Container) (containers.Container, error) { 159 | namespace, err := namespaces.NamespaceRequired(ctx) 160 | if err != nil { 161 | return containers.Container{}, err 162 | } 163 | 164 | if err := validateContainer(&container); err != nil { 165 | return containers.Container{}, errors.Wrap(err, "create 
container failed validation") 166 | } 167 | 168 | //如果没有创建过才去创建 169 | bkt, err := createContainersBucket(s.tx, namespace) 170 | if err != nil { 171 | return containers.Container{}, err 172 | } 173 | 174 | //又去创建了个子bucket,在containersBucket下边 175 | cbkt, err := bkt.CreateBucket([]byte(container.ID)) 176 | if err != nil { 177 | if err == bolt.ErrBucketExists { 178 | err = errors.Wrapf(errdefs.ErrAlreadyExists, "container %q", container.ID) 179 | } 180 | return containers.Container{}, err 181 | } 182 | 183 | container.CreatedAt = time.Now().UTC() 184 | container.UpdatedAt = container.CreatedAt 185 | if err := writeContainer(cbkt, &container); err != nil { 186 | return containers.Container{}, errors.Wrapf(err, "failed to write container %q", container.ID) 187 | } 188 | 189 | return container, nil 190 | } 191 | 192 | 综上所述,Containers相关的接口基本上是在维护容器相关的metadata.想要创建完整的容器还需要依赖其他的接口比如Task相关的接口。 193 | 194 | > A container is a metadata object that resources are allocated and 195 | > attached to 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /chapter2/section2.2.md: -------------------------------------------------------------------------------- 1 | # Task API 2 | 3 | 首先看一下关于Task API都有哪些接口, path在 containerd/api/services/tasks/v1/tasks.pb.go, 4 | 5 | var _Tasks_serviceDesc = grpc.ServiceDesc{ 6 | ServiceName: "containerd.services.tasks.v1.Tasks", 7 | HandlerType: (*TasksServer)(nil), 8 | Methods: []grpc.MethodDesc{ 9 | { 10 | MethodName: "Create", 11 | Handler: _Tasks_Create_Handler, 12 | }, 13 | { 14 | MethodName: "Start", 15 | Handler: _Tasks_Start_Handler, 16 | }, 17 | { 18 | MethodName: "Delete", 19 | Handler: _Tasks_Delete_Handler, 20 | }, 21 | { 22 | MethodName: "DeleteProcess", 23 | Handler: _Tasks_DeleteProcess_Handler, 24 | }, 25 | { 26 | MethodName: "Get", 27 | Handler: _Tasks_Get_Handler, 28 | }, 29 | { 30 | MethodName: "List", 31 | Handler: _Tasks_List_Handler, 32 | }, 33 | { 34 | MethodName: "Kill", 35 | Handler: _Tasks_Kill_Handler, 36 | }, 37 | { 38 | MethodName: "Exec", 39 | Handler: _Tasks_Exec_Handler, 40 | }, 41 | { 42 | MethodName: "ResizePty", 43 | Handler: _Tasks_ResizePty_Handler, 44 | }, 45 | { //关于容器I/O的部分我们后边会着重看一下 46 | MethodName: "CloseIO", 47 | Handler: _Tasks_CloseIO_Handler, 48 | }, 49 | { 50 | MethodName: "Pause", 51 | Handler: _Tasks_Pause_Handler, 52 | }, 53 | { 54 | MethodName: "Resume", 55 | Handler: _Tasks_Resume_Handler, 56 | }, 57 | { 58 | MethodName: "ListPids", 59 | Handler: _Tasks_ListPids_Handler, 60 | }, 61 | { 62 | MethodName: "Checkpoint", 63 | Handler: _Tasks_Checkpoint_Handler, 64 | }, 65 | { 66 | MethodName: "Update", 67 | Handler: _Tasks_Update_Handler, 68 | }, 69 | { 70 | MethodName: "Metrics", 71 | Handler: _Tasks_Metrics_Handler, 72 | }, 73 | { 74 | MethodName: "Wait", 75 | Handler: _Tasks_Wait_Handler, 76 | }, 77 | }, 78 | Streams: []grpc.StreamDesc{}, 79 | Metadata: "github.com/containerd/containerd/api/services/tasks/v1/tasks.proto", 80 | } 81 | 82 | 来看一下run一个容器的逻辑链路。 83 | 84 | path: containerd/cmd/ctr/commands/run/run.go 85 | 具体代码的就不截图了,而实现的主要逻辑如下所示: 86 | 87 | NewContainer -> c.ContainerService().Create(ctx, container)//创建container这个metadata 88 | //其中还会针对是否有无checkpoint来进行区别 89 | 90 | NewTask -> container.NewTask(ctx, ioCreator, opts...) 
-> c.client.TaskService().Create(ctx, request) //也验证了必须首先有container的metadata才能够创建,只有当Task被执行的时候才会创建一个真正的容器 91 | 92 | task.Start(ctx) 93 | 94 | //如果收到退出信号 95 | task.Delete(ctx) 96 | //返回状态码 97 | 98 | 99 | -------------------------------------------------------------------------------- /chapter3/README.md: -------------------------------------------------------------------------------- 1 | # 执行引擎 2 | 3 | 在节介绍有关containerd执行引擎的内容,包括容器I/O、runc的运行时、containerd client的运行流程。 4 | -------------------------------------------------------------------------------- /chapter3/section3.1.md: -------------------------------------------------------------------------------- 1 | # ctr run流程 2 | 3 | 我们接下来从TaskService的Create接口看起,看一下containerd是如何与底层的containerd-shim以及最后的runc进行结合工作的。 4 | 5 | opts := runtime.CreateOpts{ 6 | Spec: container.Spec, 7 | //这看起来实际操作I/O还是底层的容器执行器 8 | IO: runtime.IO{ 9 | Stdin: r.Stdin, 10 | Stdout: r.Stdout, 11 | Stderr: r.Stderr, 12 | Terminal: r.Terminal, 13 | }, 14 | Checkpoint: checkpointPath, 15 | Runtime: container.Runtime.Name, 16 | RuntimeOptions: container.Runtime.Options, 17 | TaskOptions: r.Options, 18 | } 19 | c, err := runtime.Create(ctx, r.ContainerID, opts) 20 | 21 | // Create a new task 22 | func (r *Runtime) Create(ctx context.Context, id string, opts runtime.CreateOpts) (_ runtime.Task, err error) { 23 | namespace, err := namespaces.NamespaceRequired(ctx) 24 | if err != nil { 25 | return nil, err 26 | } 27 | 28 | if err := identifiers.Validate(id); err != nil { 29 | return nil, errors.Wrapf(err, "invalid task id") 30 | } 31 | 32 | ropts, err := r.getRuncOptions(ctx, id) 33 | if err != nil { 34 | return nil, err 35 | } 36 | // newBundle 根据传入的路径和 ID 创建目录文件,/var/lib/containerd/io.containerd.runtime.v1.linux/default 37 | bundle, err := newBundle(id, 38 | filepath.Join(r.state, namespace), 39 | filepath.Join(r.root, namespace), 40 | opts.Spec.Value) 41 | if err != nil { 42 | return nil, err 43 | } 44 | defer func() { 45 | if err != nil { 46 | bundle.Delete() 47 | } 48 | }() 49 | 50 | shimopt := ShimLocal(r.config, r.events) 51 | if !r.config.NoShim { 52 | var cgroup string 53 | if opts.TaskOptions != nil { 54 | v, err := typeurl.UnmarshalAny(opts.TaskOptions) 55 | if err != nil { 56 | return nil, err 57 | } 58 | cgroup = v.(*runctypes.CreateOptions).ShimCgroup 59 | } 60 | exitHandler := func() { 61 | log.G(ctx).WithField("id", id).Info("shim reaped") 62 | t, err := r.tasks.Get(ctx, id) 63 | if err != nil { 64 | // Task was never started or was already successfully deleted 65 | return 66 | } 67 | lc := t.(*Task) 68 | 69 | log.G(ctx).WithFields(logrus.Fields{ 70 | "id": id, 71 | "namespace": namespace, 72 | }).Warn("cleaning up after killed shim") 73 | if err = r.cleanupAfterDeadShim(context.Background(), bundle, namespace, id, lc.pid); err != nil { 74 | log.G(ctx).WithError(err).WithFields(logrus.Fields{ 75 | "id": id, 76 | "namespace": namespace, 77 | }).Warn("failed to clean up after killed shim") 78 | } 79 | } 80 | //在这里产生了一个新的shim进程,adress地址就是containerd daemon的地址,调用了WithStart func 81 | shimopt = ShimRemote(r.config, r.address, cgroup, exitHandler) 82 | } 83 | 84 | // 产生一个shim client 85 | s, err := bundle.NewShimClient(ctx, namespace, shimopt, ropts) 86 | if err != nil { 87 | return nil, err 88 | } 89 | defer func() { 90 | if err != nil { 91 | if kerr := s.KillShim(ctx); kerr != nil { 92 | log.G(ctx).WithError(err).Error("failed to kill shim") 93 | } 94 | } 95 | }() 96 | 97 | rt := r.config.Runtime 98 | if ropts != nil && ropts.Runtime != "" { 99 | rt = ropts.Runtime 100 | } 
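        // At this point the bundle directory exists on disk and a shim client has been
        // set up (a separate containerd-shim process in the default, non-NoShim case);
        // everything below just packages the create request and sends it to that shim.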
101 | //填充 CreateTaskRequest 结构体,发送 GRPC 给 shim 创建,Create 路径为containerd/runtime/v1/shim/service.go 102 | sopts := &shim.CreateTaskRequest{ 103 | ID: id, 104 | Bundle: bundle.path, 105 | Runtime: rt, 106 | Stdin: opts.IO.Stdin, 107 | Stdout: opts.IO.Stdout, 108 | Stderr: opts.IO.Stderr, 109 | Terminal: opts.IO.Terminal, 110 | Checkpoint: opts.Checkpoint, 111 | Options: opts.TaskOptions, 112 | } 113 | for _, m := range opts.Rootfs { 114 | sopts.Rootfs = append(sopts.Rootfs, &types.Mount{ 115 | Type: m.Type, 116 | Source: m.Source, 117 | Options: m.Options, 118 | }) 119 | } 120 | //将创建的请求交给shim处理,我们下边看看shim会再如何处理得到的这个create请求 121 | cr, err := s.Create(ctx, sopts) 122 | if err != nil { 123 | return nil, errdefs.FromGRPC(err) 124 | } 125 | t, err := newTask(id, namespace, int(cr.Pid), s, r.events, r.tasks, bundle) 126 | if err != nil { 127 | return nil, err 128 | } 129 | if err := r.tasks.Add(ctx, t); err != nil { 130 | return nil, err 131 | } 132 | r.events.Publish(ctx, runtime.TaskCreateEventTopic, &eventstypes.TaskCreate{ 133 | ContainerID: sopts.ID, 134 | Bundle: sopts.Bundle, 135 | Rootfs: sopts.Rootfs, 136 | IO: &eventstypes.TaskIO{ 137 | Stdin: sopts.Stdin, 138 | Stdout: sopts.Stdout, 139 | Stderr: sopts.Stderr, 140 | Terminal: sopts.Terminal, 141 | }, 142 | Checkpoint: sopts.Checkpoint, 143 | Pid: uint32(t.pid), 144 | }) 145 | 146 | return t, nil 147 | } 148 | 149 | 150 | func WithStart(binary, address, daemonAddress, cgroup string, debug bool, exitHandler func()) Opt { 151 | return func(ctx context.Context, config shim.Config) (_ shimapi.ShimService, _ io.Closer, err error) { 152 | // 这个address是shim-address,用来containerd和shim进程之间的通信 153 | // filepath.Join(string(filepath.Separator), "containerd-shim", namespace, bundle.id, "shim.sock") 154 | socket, err := newSocket(address) 155 | if err != nil { 156 | return nil, nil, err 157 | } 158 | defer socket.Close() 159 | f, err := socket.File() 160 | if err != nil { 161 | return nil, nil, errors.Wrapf(err, "failed to get fd for socket %s", address) 162 | } 163 | defer f.Close() 164 | // 执行containerd-shim --namespace default --address /run/containerd/containerd.sock命令 165 | cmd, err := newCommand(binary, daemonAddress, debug, config, f) 166 | if err != nil { 167 | return nil, nil, err 168 | } 169 | if err := cmd.Start(); err != nil { 170 | return nil, nil, errors.Wrapf(err, "failed to start shim") 171 | } 172 | defer func() { 173 | if err != nil { 174 | cmd.Process.Kill() 175 | } 176 | }() 177 | go func() { 178 | cmd.Wait() 179 | exitHandler() 180 | }() 181 | log.G(ctx).WithFields(logrus.Fields{ 182 | "pid": cmd.Process.Pid, 183 | "address": address, 184 | "debug": debug, 185 | }).Infof("shim %s started", binary) 186 | // set shim in cgroup if it is provided 187 | if cgroup != "" { 188 | if err := setCgroup(cgroup, cmd); err != nil { 189 | return nil, nil, err 190 | } 191 | log.G(ctx).WithFields(logrus.Fields{ 192 | "pid": cmd.Process.Pid, 193 | "address": address, 194 | }).Infof("shim placed in cgroup %s", cgroup) 195 | } 196 | if err = sys.SetOOMScore(cmd.Process.Pid, sys.OOMScoreMaxKillable); err != nil { 197 | return nil, nil, errors.Wrap(err, "failed to set OOM Score on shim") 198 | } 199 | //通过前边介绍的shim-address产生一个client然后进行通信,使用的也是unix socket通信 200 | c, clo, err := WithConnect(address, func() {})(ctx, config) 201 | if err != nil { 202 | return nil, nil, errors.Wrap(err, "failed to connect") 203 | } 204 | return c, clo, nil 205 | } 206 | } 207 | 208 | ### shim处理逻辑 209 | path: containerd/runtime/v1/shim/service.go 210 | api list: 211 | 212 
| type ShimService interface { 213 | State(ctx context.Context, req *StateRequest) (*StateResponse, error) 214 | Create(ctx context.Context, req *CreateTaskRequest) (*CreateTaskResponse, error) 215 | Start(ctx context.Context, req *StartRequest) (*StartResponse, error) 216 | Delete(ctx context.Context, req *google_protobuf1.Empty) (*DeleteResponse, error) 217 | DeleteProcess(ctx context.Context, req *DeleteProcessRequest) (*DeleteResponse, error) 218 | ListPids(ctx context.Context, req *ListPidsRequest) (*ListPidsResponse, error) 219 | Pause(ctx context.Context, req *google_protobuf1.Empty) (*google_protobuf1.Empty, error) 220 | Resume(ctx context.Context, req *google_protobuf1.Empty) (*google_protobuf1.Empty, error) 221 | Checkpoint(ctx context.Context, req *CheckpointTaskRequest) (*google_protobuf1.Empty, error) 222 | Kill(ctx context.Context, req *KillRequest) (*google_protobuf1.Empty, error) 223 | Exec(ctx context.Context, req *ExecProcessRequest) (*google_protobuf1.Empty, error) 224 | ResizePty(ctx context.Context, req *ResizePtyRequest) (*google_protobuf1.Empty, error) 225 | CloseIO(ctx context.Context, req *CloseIORequest) (*google_protobuf1.Empty, error) 226 | ShimInfo(ctx context.Context, req *google_protobuf1.Empty) (*ShimInfoResponse, error) 227 | Update(ctx context.Context, req *UpdateTaskRequest) (*google_protobuf1.Empty, error) 228 | Wait(ctx context.Context, req *WaitRequest) (*WaitResponse, error) 229 | } 230 | 231 | // 上文中提到过的CreateRequest通过shim的unix socket交给这个函数来进行处理 232 | // Create a new initial process and container with the underlying OCI runtime 233 | func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (_ *shimapi.CreateTaskResponse, err error) { 234 | s.mu.Lock() 235 | defer s.mu.Unlock() 236 | 237 | var mounts []proc.Mount 238 | for _, m := range r.Rootfs { 239 | mounts = append(mounts, proc.Mount{ 240 | Type: m.Type, 241 | Source: m.Source, 242 | Target: m.Target, 243 | Options: m.Options, 244 | }) 245 | } 246 | 247 | config := &proc.CreateConfig{ 248 | ID: r.ID, 249 | Bundle: r.Bundle, 250 | Runtime: r.Runtime, 251 | Rootfs: mounts, 252 | Terminal: r.Terminal, 253 | Stdin: r.Stdin, 254 | Stdout: r.Stdout, 255 | Stderr: r.Stderr, 256 | Checkpoint: r.Checkpoint, 257 | ParentCheckpoint: r.ParentCheckpoint, 258 | Options: r.Options, 259 | } 260 | rootfs := filepath.Join(r.Bundle, "rootfs") 261 | defer func() { 262 | if err != nil { 263 | if err2 := mount.UnmountAll(rootfs, 0); err2 != nil { 264 | log.G(ctx).WithError(err2).Warn("Failed to cleanup rootfs mount") 265 | } 266 | } 267 | }() 268 | for _, rm := range mounts { 269 | m := &mount.Mount{ 270 | Type: rm.Type, 271 | Source: rm.Source, 272 | Options: rm.Options, 273 | } 274 | if err := m.Mount(rootfs); err != nil { 275 | return nil, errors.Wrapf(err, "failed to mount rootfs component %v", m) 276 | } 277 | } 278 | //下文有介绍 279 | process, err := newInit( 280 | ctx, 281 | s.config.Path, 282 | s.config.WorkDir, 283 | s.config.RuntimeRoot, 284 | s.config.Namespace, 285 | s.config.Criu, 286 | s.config.SystemdCgroup, 287 | s.platform, 288 | config, 289 | ) 290 | if err != nil { 291 | return nil, errdefs.ToGRPC(err) 292 | } 293 | //在这里函数里边开始创建物理容器进程 294 | //最终在func (r *Runc) Create(context context.Context, id, bundle string, opts *CreateOpts) 里边通过/run/containerd/runc命令创建符合OCI标准的容器物理进程 295 | if err := process.Create(ctx, config); err != nil { 296 | return nil, errdefs.ToGRPC(err) 297 | } 298 | // save the main task id and bundle to the shim for additional requests 299 | s.id = r.ID 300 | s.bundle = 
r.Bundle 301 | pid := process.Pid() 302 | s.processes[r.ID] = process 303 | return &shimapi.CreateTaskResponse{ 304 | Pid: uint32(pid), 305 | }, nil 306 | } 307 | 308 | //在这个func里边返回一个process描述对象供以后使用,这里这里已经是在shim进程里边了。 309 | func newInit(ctx context.Context, path, workDir, runtimeRoot, namespace, criu string, systemdCgroup bool, platform rproc.Platform, r *proc.CreateConfig) (*proc.Init, error) { 310 | var options runctypes.CreateOptions 311 | if r.Options != nil { 312 | v, err := typeurl.UnmarshalAny(r.Options) 313 | if err != nil { 314 | return nil, err 315 | } 316 | options = *v.(*runctypes.CreateOptions) 317 | } 318 | 319 | rootfs := filepath.Join(path, "rootfs") 320 | runtime := proc.NewRunc(runtimeRoot, path, namespace, r.Runtime, criu, systemdCgroup) 321 | //创建进程描述对象,这个时候还没有创建起物理的容器进程 322 | p := proc.New(r.ID, runtime, rproc.Stdio{ 323 | Stdin: r.Stdin, 324 | Stdout: r.Stdout, 325 | Stderr: r.Stderr, 326 | Terminal: r.Terminal, 327 | }) 328 | p.Bundle = r.Bundle 329 | p.Platform = platform 330 | p.Rootfs = rootfs 331 | p.WorkDir = workDir 332 | p.IoUID = int(options.IoUid) 333 | p.IoGID = int(options.IoGid) 334 | p.NoPivotRoot = options.NoPivotRoot 335 | p.NoNewKeyring = options.NoNewKeyring 336 | return p, nil 337 | } 338 | 339 | ### 总结 340 | containerd daemon看起来只是一层比较薄的逻辑。 341 | containerd-shim则相当于是容器物理进程,比如会在容器物理挂掉后接管它,避免它被init进程监管等。 342 | 具体的执行还是都交给了底层的runc来完成。 343 | 344 | 流程图如下所示 345 | ![未命名文件.png-34.3kB][1] 346 | 347 | 348 | ### 参考 349 | https://blog.csdn.net/zhonglinzhang/article/details/76615127 350 | https://blog.csdn.net/zhonglinzhang/article/category/3271199 351 | https://blog.csdn.net/zhonglinzhang/article/details/76683925 352 | 353 | 354 | [1]: http://static.zybuluo.com/myecho/r5rshm7wg9ahrv56cqk07zy5/%E6%9C%AA%E5%91%BD%E5%90%8D%E6%96%87%E4%BB%B6.png 355 | -------------------------------------------------------------------------------- /chapter3/section3.2.md: -------------------------------------------------------------------------------- 1 | # Container I/O 2 | 3 | 那么在整个运作过程容器的I/O是如何通过containerd的API来操作的呢?I/O如何被打开?I/O如何被关闭?I/O之间如何进行数据交换的操作,都将在这一节得到答案。 4 | 5 | path: containerd/cmd/ctr/commands/tasks/tasks_unix.go 6 | 7 | // NewTask creates a new task 8 | func NewTask(ctx gocontext.Context, client *containerd.Client, container containerd.Container, checkpoint string, con console.Console, nullIO bool, ioOpts []cio.Opt, opts ...containerd.NewTaskOpts) (containerd.Task, error) { 9 | //默认是使用stdin来初始化client的I/O,如os.stdout/os.stdin/os.stderr 10 | stdio := cio.NewCreator(append([]cio.Opt{cio.WithStdio}, ioOpts...)...) 11 | if checkpoint != "" { 12 | im, err := client.GetImage(ctx, checkpoint) 13 | if err != nil { 14 | return nil, err 15 | } 16 | opts = append(opts, containerd.WithTaskCheckpoint(im)) 17 | } 18 | ioCreator := stdio 19 | // 如果指定了tty 20 | if con != nil { 21 | // 改变client端的stdion, stdin -> console, stdout -> console, stderr->nil 22 | ioCreator = cio.NewCreator(append([]cio.Opt{cio.WithStreams(con, con, nil), cio.WithTerminal}, ioOpts...)...) 23 | } 24 | if nullIO { 25 | if con != nil { 26 | return nil, errors.New("tty and null-io cannot be used together") 27 | } 28 | ioCreator = cio.NullIO 29 | } 30 | //最终会调用containerd的taskService, 这里创建的FIFO-DIR也会作为参数传入 31 | return container.NewTask(ctx, ioCreator, opts...) 
32 | } 33 | 34 | // NewCreator returns an IO creator from the options 35 | func NewCreator(opts ...Opt) Creator { 36 | streams := &Streams{} 37 | for _, opt := range opts { 38 | opt(streams) 39 | } 40 | if streams.FIFODir == "" { 41 | streams.FIFODir = defaults.DefaultFIFODir 42 | } 43 | return func(id string) (IO, error) { 44 | //创建containerd对外的I/O管道,所有的I/O都是通过fifo与client端的I/O进行数据交换的 45 | // path形如:filepath.Join(dir, id+"-stdin"), id就是container id 46 | fifos, err := NewFIFOSetInDir(streams.FIFODir, id, streams.Terminal) 47 | if err != nil { 48 | return nil, err 49 | } 50 | if streams.Stdin == nil { 51 | fifos.Stdin = "" 52 | } 53 | if streams.Stdout == nil { 54 | fifos.Stdout = "" 55 | } 56 | if streams.Stderr == nil { 57 | fifos.Stderr = "" 58 | } 59 | //真正的数据交换过程 60 | return copyIO(fifos, streams) 61 | } 62 | } 63 | 64 | //ioset对应的就是containerd client端的I/O 65 | func copyIO(fifos *FIFOSet, ioset *Streams) (*cio, error) { 66 | var ctx, cancel = context.WithCancel(context.Background()) 67 | //注重看下打开io的方式 68 | pipes, err := openFifos(ctx, fifos) 69 | if err != nil { 70 | cancel() 71 | return nil, err 72 | } 73 | 74 | if fifos.Stdin != "" { 75 | go func() { 76 | p := bufPool.Get().(*[]byte) 77 | defer bufPool.Put(p) 78 | 79 | io.CopyBuffer(pipes.Stdin, ioset.Stdin, *p) 80 | pipes.Stdin.Close() 81 | }() 82 | } 83 | 84 | var wg = &sync.WaitGroup{} 85 | wg.Add(1) 86 | go func() { 87 | p := bufPool.Get().(*[]byte) 88 | defer bufPool.Put(p) 89 | 90 | io.CopyBuffer(ioset.Stdout, pipes.Stdout, *p) 91 | pipes.Stdout.Close() 92 | wg.Done() 93 | }() 94 | 95 | //对于terminal的环境来说,stdout/stderr统一拷贝到stdout,两者不区分 96 | if !fifos.Terminal { 97 | wg.Add(1) 98 | go func() { 99 | p := bufPool.Get().(*[]byte) 100 | defer bufPool.Put(p) 101 | 102 | io.CopyBuffer(ioset.Stderr, pipes.Stderr, *p) 103 | pipes.Stderr.Close() 104 | wg.Done() 105 | }() 106 | } 107 | return &cio{ 108 | config: fifos.Config, 109 | wg: wg, 110 | closers: append(pipes.closers(), fifos), 111 | cancel: cancel, 112 | }, nil 113 | } 114 | 115 | func openFifos(ctx context.Context, fifos *FIFOSet) (pipes, error) { 116 | var err error 117 | defer func() { 118 | if err != nil { 119 | fifos.Close() 120 | } 121 | }() 122 | 123 | var f pipes 124 | if fifos.Stdin != "" { 125 | //非堵塞方式打开写端 126 | if f.Stdin, err = fifo.OpenFifo(ctx, fifos.Stdin, syscall.O_WRONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); err != nil { 127 | return f, errors.Wrapf(err, "failed to open stdin fifo") 128 | } 129 | defer func() { 130 | if err != nil && f.Stdin != nil { 131 | f.Stdin.Close() 132 | } 133 | }() 134 | } 135 | if fifos.Stdout != "" { 136 | //非堵塞方式打开读端 137 | if f.Stdout, err = fifo.OpenFifo(ctx, fifos.Stdout, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); err != nil { 138 | return f, errors.Wrapf(err, "failed to open stdout fifo") 139 | } 140 | defer func() { 141 | if err != nil && f.Stdout != nil { 142 | f.Stdout.Close() 143 | } 144 | }() 145 | } 146 | if fifos.Stderr != "" { 147 | //非堵塞方式打开读端 148 | if f.Stderr, err = fifo.OpenFifo(ctx, fifos.Stderr, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); err != nil { 149 | return f, errors.Wrapf(err, "failed to open stderr fifo") 150 | } 151 | } 152 | return f, nil 153 | } 154 | 155 | 对于container的daemon来说,它只需要将I/O通过fifo管道写出去即可(其实还是通过底层runc和shim交互来完成的),不需要关心谁从FIFO管道消费。 156 | 157 | 接下来再看下,containerd daemon拿到fifo-dir后会如何处理: 158 | 159 | path: containerd/services/tasks/local.go 160 | 161 | opts := runtime.CreateOpts{ 162 | Spec: container.Spec, 163 | //这里的I/O还是fifo-dir目录下的fifo 164 | IO: runtime.IO{ 165 | 
Stdin: r.Stdin, 166 | Stdout: r.Stdout, 167 | Stderr: r.Stderr, 168 | Terminal: r.Terminal, 169 | }, 170 | Checkpoint: checkpointPath, 171 | Runtime: container.Runtime.Name, 172 | RuntimeOptions: container.Runtime.Options, 173 | TaskOptions: r.Options, 174 | } 175 | c, err := runtime.Create(ctx, r.ContainerID, opts) 176 | 177 | path: containerd/runtime/v1/linux/runtime.go 178 | 179 | sopts := &shim.CreateTaskRequest{ 180 | ID: id, 181 | Bundle: bundle.path, 182 | Runtime: rt, 183 | Stdin: opts.IO.Stdin, 184 | Stdout: opts.IO.Stdout, 185 | Stderr: opts.IO.Stderr, 186 | Terminal: opts.IO.Terminal, 187 | Checkpoint: opts.Checkpoint, 188 | Options: opts.TaskOptions, 189 | } 190 | cr, err := s.Create(ctx, sopts) 191 | //封装成task请求然后又交给shim进程来处理了 192 | 193 | path: containerd/runtime/v1/shim/service.go 194 | 195 | if err := process.Create(ctx, config); err != nil { 196 | return nil, errdefs.ToGRPC(err) 197 | } 198 | 199 | path: containerd/runtime/v1/linux/proc/init.go 200 | 201 | // Create the process with the provided config 202 | func (p *Init) Create(ctx context.Context, r *CreateConfig) error { 203 | var ( 204 | err error 205 | socket *runc.Socket 206 | ) 207 | //根据是否tty有两种区分,也能体现在后边copyI/O的步骤上 208 | if r.Terminal { 209 | if socket, err = runc.NewTempConsoleSocket(); err != nil { 210 | return errors.Wrap(err, "failed to create OCI runtime console socket") 211 | } 212 | defer socket.Close() 213 | } else if hasNoIO(r) { 214 | if p.io, err = runc.NewNullIO(); err != nil { 215 | return errors.Wrap(err, "creating new NULL IO") 216 | } 217 | } else { 218 | //这里是创建与容器物理进程通信的Pipe, 不是前边介绍的fifo-dir目录下的fifo 219 | if p.io, err = runc.NewPipeIO(p.IoUID, p.IoGID, withConditionalIO(p.stdio)); err != nil { 220 | return errors.Wrap(err, "failed to create OCI runtime io pipes") 221 | } 222 | } 223 | pidFile := filepath.Join(p.Bundle, InitPidFile) 224 | if r.Checkpoint != "" { 225 | opts := &runc.RestoreOpts{ 226 | CheckpointOpts: runc.CheckpointOpts{ 227 | ImagePath: r.Checkpoint, 228 | WorkDir: p.WorkDir, 229 | ParentPath: r.ParentCheckpoint, 230 | }, 231 | PidFile: pidFile, 232 | IO: p.io, 233 | NoPivot: p.NoPivotRoot, 234 | Detach: true, 235 | NoSubreaper: true, 236 | } 237 | p.initState = &createdCheckpointState{ 238 | p: p, 239 | opts: opts, 240 | } 241 | return nil 242 | } 243 | opts := &runc.CreateOpts{ 244 | PidFile: pidFile, 245 | IO: p.io, 246 | NoPivot: p.NoPivotRoot, 247 | NoNewKeyring: p.NoNewKeyring, 248 | } 249 | if socket != nil { 250 | opts.ConsoleSocket = socket 251 | } 252 | //这是真正创建物理容器进程的步骤,在其中通过 opts.Set(cmd)将pipe I/O set进去 253 | if err := p.runtime.Create(ctx, r.ID, r.Bundle, opts); err != nil { 254 | return p.runtimeError(err, "OCI runtime create failed") 255 | } 256 | //这里为什么打开r.stdin的fifo写端,在CloseIO的时候关闭的就是它 257 | //如果只有读端,如果外部进程异常退出了,那么容器只能认为容器输入结束了(因为对于fifo来说,唯一的写关闭了,读就会自动关闭) 258 | //而如果增加了写端,就能够区分正常和异常结束,同时可以将关闭的主动权握在自己手里,正常结束通过closeIO来关闭写端来结束 259 | if r.Stdin != "" { 260 | sc, err := fifo.OpenFifo(ctx, r.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0) 261 | if err != nil { 262 | return errors.Wrapf(err, "failed to open stdin fifo %s", r.Stdin) 263 | } 264 | p.stdin = sc 265 | p.closers = append(p.closers, sc) 266 | } 267 | var copyWaitGroup sync.WaitGroup 268 | //下边的逻辑是处理从pipe拷贝到fifo的过程 269 | if socket != nil { 270 | console, err := socket.ReceiveMaster() 271 | if err != nil { 272 | return errors.Wrap(err, "failed to retrieve console master") 273 | } 274 | console, err = p.Platform.CopyConsole(ctx, console, r.Stdin, r.Stdout, r.Stderr, &p.wg, ©WaitGroup) 275 | if err != nil { 276 | 
return errors.Wrap(err, "failed to start console copy") 277 | } 278 | p.console = console 279 | } else if !hasNoIO(r) { 280 | if err := copyPipes(ctx, p.io, r.Stdin, r.Stdout, r.Stderr, &p.wg, ©WaitGroup); err != nil { 281 | return errors.Wrap(err, "failed to start io pipe copy") 282 | } 283 | } 284 | 285 | copyWaitGroup.Wait() 286 | pid, err := runc.ReadPidFile(pidFile) 287 | if err != nil { 288 | return errors.Wrap(err, "failed to retrieve OCI runtime container pid") 289 | } 290 | p.pid = pid 291 | return nil 292 | } 293 | 294 | //以非tty的情况下举例子 295 | 296 | func copyPipes(ctx context.Context, rio runc.IO, stdin, stdout, stderr string, wg, cwg *sync.WaitGroup) error { 297 | var sameFile io.WriteCloser 298 | for _, i := range []struct { 299 | name string 300 | dest func(wc io.WriteCloser, rc io.Closer) 301 | }{ 302 | { 303 | name: stdout, 304 | dest: func(wc io.WriteCloser, rc io.Closer) { 305 | wg.Add(1) 306 | cwg.Add(1) 307 | go func() { 308 | cwg.Done() 309 | p := bufPool.Get().(*[]byte) 310 | defer bufPool.Put(p) 311 | io.CopyBuffer(wc, rio.Stdout(), *p) 312 | wg.Done() 313 | wc.Close() 314 | if rc != nil { 315 | rc.Close() 316 | } 317 | }() 318 | }, 319 | }, { 320 | name: stderr, 321 | dest: func(wc io.WriteCloser, rc io.Closer) { 322 | wg.Add(1) 323 | cwg.Add(1) 324 | go func() { 325 | cwg.Done() 326 | p := bufPool.Get().(*[]byte) 327 | defer bufPool.Put(p) 328 | io.CopyBuffer(wc, rio.Stderr(), *p) 329 | wg.Done() 330 | wc.Close() 331 | if rc != nil { 332 | rc.Close() 333 | } 334 | }() 335 | }, 336 | }, 337 | } {//开始对stdout和stderr处理 338 | ok, err := isFifo(i.name) 339 | if err != nil { 340 | return err 341 | } 342 | var ( 343 | fw io.WriteCloser 344 | fr io.Closer 345 | ) 346 | if ok { 347 | //为什么stdout和stderr读端和写端都打开?这里还是为了防止fifo的自动关闭,开了写再开个读,什么时候关闭全由自己决定 348 | if fw, err = fifo.OpenFifo(ctx, i.name, syscall.O_WRONLY, 0); err != nil { 349 | return fmt.Errorf("containerd-shim: opening %s failed: %s", i.name, err) 350 | } 351 | if fr, err = fifo.OpenFifo(ctx, i.name, syscall.O_RDONLY, 0); err != nil { 352 | return fmt.Errorf("containerd-shim: opening %s failed: %s", i.name, err) 353 | } 354 | } else { 355 | if sameFile != nil { 356 | i.dest(sameFile, nil) 357 | continue 358 | } 359 | if fw, err = os.OpenFile(i.name, syscall.O_WRONLY|syscall.O_APPEND, 0); err != nil { 360 | return fmt.Errorf("containerd-shim: opening %s failed: %s", i.name, err) 361 | } 362 | if stdout == stderr { 363 | sameFile = fw 364 | } 365 | } 366 | i.dest(fw, fr) 367 | } 368 | if stdin == "" { 369 | return nil 370 | } 371 | //stdin打开读端用作正常使用 372 | f, err := fifo.OpenFifo(ctx, stdin, syscall.O_RDONLY|syscall.O_NONBLOCK, 0) 373 | if err != nil { 374 | return fmt.Errorf("containerd-shim: opening %s failed: %s", stdin, err) 375 | } 376 | cwg.Add(1) 377 | go func() { 378 | cwg.Done() 379 | p := bufPool.Get().(*[]byte) 380 | defer bufPool.Put(p) 381 | 382 | io.CopyBuffer(rio.Stdin(), f, *p) 383 | rio.Stdin().Close() 384 | f.Close() 385 | }() 386 | return nil 387 | } 388 | 389 | #### Container I/O的关闭 390 | 下边关注一下`_Tasks_CloseIO_Handler`,也就是处理I/O关闭的handler. 
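Before looking at the daemon side, here is a minimal client-side sketch (an illustration, not code from this repo) of how stdin normally gets closed once input is finished, using the containerd Go client; `task` is assumed to be a `containerd.Task` handle obtained from the client, and this call is what eventually reaches the handler shown next:

    package main

    import (
    	"context"

    	"github.com/containerd/containerd"
    )

    // closeStdin asks containerd to close the task's stdin. The request goes through
    // the CloseIO handler in services/tasks and is then forwarded on to the shim.
    func closeStdin(ctx context.Context, task containerd.Task) error {
    	return task.CloseIO(ctx, containerd.WithStdinCloser)
    }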
391 | 392 | path: containerd/services/tasks/local.go 393 | 394 | func (l *local) CloseIO(ctx context.Context, r *api.CloseIORequest, _ ...grpc.CallOption) (*ptypes.Empty, error) { 395 | t, err := l.getTask(ctx, r.ContainerID) 396 | if err != nil { 397 | return nil, err 398 | } 399 | p := runtime.Process(t) 400 | if r.ExecID != "" { 401 | if p, err = t.Process(ctx, r.ExecID); err != nil { 402 | return nil, errdefs.ToGRPC(err) 403 | } 404 | } 405 | if r.Stdin { 406 | if err := p.CloseIO(ctx); err != nil { 407 | return nil, err 408 | } 409 | } 410 | return empty, nil 411 | } 412 | 413 | // CloseIO closes the provided IO pipe for the process 414 | func (p *process) CloseIO(ctx context.Context) error { 415 | _, err := p.shim.task.CloseIO(ctx, &task.CloseIORequest{ 416 | ID: p.shim.ID(), 417 | ExecID: p.id, 418 | Stdin: true, 419 | }) 420 | if err != nil { 421 | return errdefs.FromGRPC(err) 422 | } 423 | return nil 424 | } 425 | 426 | // CloseIO of a process 427 | func (s *Service) CloseIO(ctx context.Context, r *shimapi.CloseIORequest) (*ptypes.Empty, error) { 428 | s.mu.Lock() 429 | defer s.mu.Unlock() 430 | p := s.processes[r.ID] 431 | if p == nil { 432 | return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", r.ID) 433 | } 434 | // here we can see that what actually gets closed is stdin (this corresponds to the write end of r.Stdin opened earlier) 435 | if stdin := p.Stdin(); stdin != nil { 436 | if err := stdin.Close(); err != nil { 437 | return nil, errors.Wrap(err, "close stdin") 438 | } 439 | } 440 | return empty, nil 441 | } 442 | 443 | 444 | ### Summary 445 | In the tty case, runc and the shim communicate over a unix socket, as can be seen from `runc create --console-socket value`; stdout and stderr are not distinguished and both go through the socket. In the non-tty case, communication happens over pipes between the parent and child processes, and stdout and stderr are kept separate. 446 | 447 | ### Summary flowchart 448 | ![angular.js.png-8.9kB][1] 449 | 450 | 451 | ### Pouch I/O design 452 | ![image.png-79.5kB][2] 453 | 454 | https://github.com/alibaba/pouch/pull/2375/files 455 | 456 | Pay particular attention to understanding the following passage: 457 | 458 | > The contained-shim will open fifo twice for reading and writing. For 459 | > the writing mode, the shim doesn't close stdin fifo until the client 460 | > calls CloseIO. In some case, the pouch daemon might be crash before 461 | > finishing the input. If shim doesn't hold writing mode fifo, the 462 | > process in container will consider that it is EOF signal and exit. 463 | > 464 | > Based on this case, if the client sends EOF signal to input channel, 465 | > the pouchd should send the CloseIO to shim to let the process exit. 466 | > 467 | > StdinOnce in container's config is used by attach request. If the 468 | > StdinOnce is true, when one attach request finishes stream copy, the 469 | > pouchd will closes the input of process. So other attach requests to 470 | > the same container will be stopped. 471 | > 472 | > If the user wants StdinOnce, it should set it to true during creating 473 | > container. 474 | 475 | Thanks to @fuwid for the explanation.
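To make the fifo behaviour described above concrete, here is a small, standalone Go sketch (illustration only, not containerd code, with error handling mostly elided): a fifo reader only sees EOF once every write end has been closed, which is exactly why the shim keeps an extra write fd open until an explicit CloseIO.

    package main

    import (
    	"fmt"
    	"io"
    	"os"
    	"path/filepath"
    	"syscall"
    	"time"
    )

    func main() {
    	path := filepath.Join(os.TempDir(), "stdin-demo.fifo")
    	_ = os.Remove(path) // clean up any leftover fifo from a previous run
    	if err := syscall.Mkfifo(path, 0600); err != nil {
    		panic(err)
    	}
    	defer os.Remove(path)

    	done := make(chan struct{})
    	go func() {
    		// the "shim" side: blocks until a writer shows up, then copies until EOF
    		r, _ := os.OpenFile(path, os.O_RDONLY, 0)
    		io.Copy(os.Stdout, r) // returns only once the LAST writer has closed
    		fmt.Println("reader saw EOF")
    		close(done)
    	}()

    	// extra write end held by the "shim": while it stays open, the reader above
    	// can never see EOF, even if individual clients come and go
    	guard, _ := os.OpenFile(path, os.O_WRONLY, 0)

    	// a "client" writes some stdin and then disappears without a CloseIO
    	w, _ := os.OpenFile(path, os.O_WRONLY, 0)
    	w.WriteString("hello from a client\n")
    	w.Close()

    	time.Sleep(100 * time.Millisecond) // the reader is still blocked: no EOF yet

    	guard.Close() // the explicit CloseIO: drop the last write end ...
    	<-done        // ... and only now does the reader observe EOF
    }

This is also the same reason copyPipes above opens the stdout/stderr fifos for both reading and writing: the extra fd keeps the fifo's lifetime under the shim's control instead of letting it be decided by whichever peer happens to exit first.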
476 | 477 | ### 参考 478 | http://hustcat.github.io/terminal-and-docker/ 479 | https://github.com/alibaba/pouch/pull/2375/files 480 | 481 | 482 | [1]: http://static.zybuluo.com/myecho/3xdwajlta9567m0bcjq77qc6/angular.js.png 483 | [2]: http://static.zybuluo.com/myecho/lig45a4gvcubxul5b9bqv05d/image.png 484 | -------------------------------------------------------------------------------- /chapter3/section3.3.md: -------------------------------------------------------------------------------- 1 | # runc 2 | 3 | ### OCI runtime spec 4 | 介绍的比较好的文章: 5 | https://segmentfault.com/a/1190000009583199 6 | 7 | https://github.com/opencontainers/runtime-spec 8 | https://github.com/opencontainers/runtime-tools 9 | 10 | ### runc使用实例 11 | 12 | 1. 首先通过 13 | 14 | $ docker pull busybox 15 | $ mkdir -p /tmp/mycontainer/rootfs 16 | $ cd /tmp/mycontainer 17 | $ docker export $(docker create busybox) | tar -C rootfs -xvf - 18 | 19 | 产生一个rootfs,当然还可以通过其他的runtime-tools来直接生成 20 | 2. 通过runc spec产生一个符合runtime spec的bundle config.json,下边是一个busybox镜像导出的config.json的例子 21 | 22 | { 23 | "ociVersion": "1.0.0", 24 | //表示进入容器后要执行什么命令,后边还会提到 25 | "process": { 26 | "terminal": true, 27 | "user": { 28 | "uid": 0, 29 | "gid": 0 30 | }, 31 | "args": [ 32 | "sh" 33 | ], 34 | "env": [ 35 | "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 36 | "TERM=xterm" 37 | ], 38 | "cwd": "/", 39 | "capabilities": { 40 | "bounding": [ 41 | "CAP_AUDIT_WRITE", 42 | "CAP_KILL", 43 | "CAP_NET_BIND_SERVICE" 44 | ], 45 | "effective": [ 46 | "CAP_AUDIT_WRITE", 47 | "CAP_KILL", 48 | "CAP_NET_BIND_SERVICE" 49 | ], 50 | "inheritable": [ 51 | "CAP_AUDIT_WRITE", 52 | "CAP_KILL", 53 | "CAP_NET_BIND_SERVICE" 54 | ], 55 | "permitted": [ 56 | "CAP_AUDIT_WRITE", 57 | "CAP_KILL", 58 | "CAP_NET_BIND_SERVICE" 59 | ], 60 | "ambient": [ 61 | "CAP_AUDIT_WRITE", 62 | "CAP_KILL", 63 | "CAP_NET_BIND_SERVICE" 64 | ] 65 | }, 66 | "rlimits": [ 67 | { 68 | "type": "RLIMIT_NOFILE", 69 | "hard": 1024, 70 | "soft": 1024 71 | } 72 | ], 73 | "noNewPrivileges": true 74 | }, 75 | "root": { 76 | "path": "rootfs", 77 | "readonly": true 78 | }, 79 | "hostname": "runc", 80 | "mounts": [ 81 | { 82 | "destination": "/proc", 83 | "type": "proc", 84 | "source": "proc" 85 | }, 86 | { 87 | "destination": "/dev", 88 | "type": "tmpfs", 89 | "source": "tmpfs", 90 | "options": [ 91 | "nosuid", 92 | "strictatime", 93 | "mode=755", 94 | "size=65536k" 95 | ] 96 | }, 97 | { 98 | "destination": "/dev/pts", 99 | "type": "devpts", 100 | "source": "devpts", 101 | "options": [ 102 | "nosuid", 103 | "noexec", 104 | "newinstance", 105 | "ptmxmode=0666", 106 | "mode=0620", 107 | "gid=5" 108 | ] 109 | }, 110 | { 111 | "destination": "/dev/shm", 112 | "type": "tmpfs", 113 | "source": "shm", 114 | "options": [ 115 | "nosuid", 116 | "noexec", 117 | "nodev", 118 | "mode=1777", 119 | "size=65536k" 120 | ] 121 | }, 122 | { 123 | "destination": "/dev/mqueue", 124 | "type": "mqueue", 125 | "source": "mqueue", 126 | "options": [ 127 | "nosuid", 128 | "noexec", 129 | "nodev" 130 | ] 131 | }, 132 | { 133 | "destination": "/sys", 134 | "type": "sysfs", 135 | "source": "sysfs", 136 | "options": [ 137 | "nosuid", 138 | "noexec", 139 | "nodev", 140 | "ro" 141 | ] 142 | }, 143 | { 144 | "destination": "/sys/fs/cgroup", 145 | "type": "cgroup", 146 | "source": "cgroup", 147 | "options": [ 148 | "nosuid", 149 | "noexec", 150 | "nodev", 151 | "relatime", 152 | "ro" 153 | ] 154 | } 155 | ], 156 | "linux": { 157 | "resources": { 158 | "devices": [ 159 | { 160 | "allow": false, 161 | "access": "rwm" 162 | } 163 | 
] 164 | }, 165 | "namespaces": [ 166 | { 167 | "type": "pid" 168 | }, 169 | { 170 | "type": "network", 171 | "path": "/var/run/netns/runc1" 172 | }, 173 | { 174 | "type": "ipc" 175 | }, 176 | { 177 | "type": "uts" 178 | }, 179 | { 180 | "type": "mount" 181 | } 182 | ], 183 | "maskedPaths": [ 184 | "/proc/kcore", 185 | "/proc/latency_stats", 186 | "/proc/timer_list", 187 | "/proc/timer_stats", 188 | "/proc/sched_debug", 189 | "/sys/firmware" 190 | ], 191 | "readonlyPaths": [ 192 | "/proc/asound", 193 | "/proc/bus", 194 | "/proc/fs", 195 | "/proc/irq", 196 | "/proc/sys", 197 | "/proc/sysrq-trigger" 198 | ] 199 | } 200 | } 201 | 202 | 3. runc run启动容器 203 | 4. runc list查看目前已经有的容器 204 | 205 | ### 源码分析 206 | 本质上runc是对libContainer的一层封装,将符合OCI的config.json转化为libContainer需要的配置文件,然后通过libContainer将容器启动。 207 | 208 | #### 容器创建 209 | path: opencontainers/runc/create.go 210 | 211 | Action: func(context *cli.Context) error { 212 | if err := checkArgs(context, 1, exactArgs); err != nil { 213 | return err 214 | } 215 | if err := revisePidFile(context); err != nil { 216 | return err 217 | } 218 | //load config.json到内存来 219 | spec, err := setupSpec(context) 220 | if err != nil { 221 | return err 222 | } 223 | // CT_ACT_CREATE参数,表示首次创建容器 224 | status, err := startContainer(context, spec, CT_ACT_CREATE, nil) 225 | if err != nil { 226 | return err 227 | } 228 | // exit with the container's exit status so any external supervisor is 229 | // notified of the exit with the correct exit status. 230 | os.Exit(status) 231 | return nil 232 | } 233 | 234 | func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) { 235 | //获取容器id 236 | id := context.Args().First() 237 | if id == "" { 238 | return -1, errEmptyID 239 | } 240 | 241 | notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id) 242 | if notifySocket != nil { 243 | //如果systemd支持的话,给容器添加对应的socket通信路径 244 | notifySocket.setupSpec(context, spec) 245 | } 246 | //根据spec中Container相关的内容,调用libcontainer 创建容器对象,且容器的状态设置为Stopped。仅仅只是一个内存中的数据结构,并没有与之对应的进程 247 | container, err := createContainer(context, id, spec) 248 | if err != nil { 249 | return -1, err 250 | } 251 | 252 | if notifySocket != nil { 253 | err := notifySocket.setupSocket() 254 | if err != nil { 255 | return -1, err 256 | } 257 | } 258 | 259 | // Support on-demand socket activation by passing file descriptors into the container init process. 
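    // 补充说明:LISTEN_FDS / LISTEN_PID 是 systemd socket activation 的约定,
    // systemd 会把已打开的 socket 从 fd 3 开始传给子进程,并用 LISTEN_FDS 标明个数;
    // 这里 runc 收集这些 fd 并透传给容器 init 进程(见下方 process.ExtraFiles),实现按需激活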
260 | listenFDs := []*os.File{} 261 | if os.Getenv("LISTEN_FDS") != "" { 262 | listenFDs = activation.Files(false) 263 | } 264 | r := &runner{ 265 | enableSubreaper: !context.Bool("no-subreaper"), 266 | shouldDestroy: true, 267 | container: container, 268 | listenFDs: listenFDs, 269 | notifySocket: notifySocket, 270 | consoleSocket: context.String("console-socket"), 271 | detach: context.Bool("detach"), 272 | pidFile: context.String("pid-file"), 273 | preserveFDs: context.Int("preserve-fds"), 274 | action: action, 275 | criuOpts: criuOpts, 276 | init: true, 277 | } 278 | //将spec中的Process转换成libcontainer兼容的模式,并对容器的IO进行配置 279 | return r.run(spec.Process) 280 | } 281 | 282 | func (r *runner) run(config *specs.Process) (int, error) { 283 | //检查有关tty的设置,其中的console-socket就是tty mode下需要用的unix-socket 284 | if err := r.checkTerminal(config); err != nil { 285 | r.destroy() 286 | return -1, err 287 | } 288 | //将spec的Process转换为libcontainer要求的Process配置格式 289 | process, err := newProcess(*config, r.init) 290 | if err != nil { 291 | r.destroy() 292 | return -1, err 293 | } 294 | if len(r.listenFDs) > 0 { 295 | process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1") 296 | process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...) 297 | } 298 | baseFd := 3 + len(process.ExtraFiles) 299 | for i := baseFd; i < baseFd+r.preserveFDs; i++ { 300 | process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i))) 301 | } 302 | rootuid, err := r.container.Config().HostRootUID() 303 | if err != nil { 304 | r.destroy() 305 | return -1, err 306 | } 307 | rootgid, err := r.container.Config().HostRootGID() 308 | if err != nil { 309 | r.destroy() 310 | return -1, err 311 | } 312 | var ( 313 | detach = r.detach || (r.action == CT_ACT_CREATE) 314 | ) 315 | // Setting up IO is a two stage process. We need to modify process to deal 316 | // with detaching containers, and then we get a tty after the container has 317 | // started. 
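    // 补充说明:tty 模式下,容器里的 runc init 会创建 pty,并把 master 端 fd
    // 通过 --console-socket 指定的 unix socket(SCM_RIGHTS)发回给调用方(例如 containerd-shim);
    // non-tty 模式下则直接使用父子进程间继承的 pipe,下面的 setupIO 就是在处理这两种情况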
318 | / 319 | handler := newSignalHandler(r.enableSubreaper, r.notifySocket) 320 | //配置容器I/O,前边有章节专门介绍过 321 | tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket) 322 | if err != nil { 323 | r.destroy() 324 | return -1, err 325 | } 326 | defer tty.Close() 327 | 328 | //根据调用方法传入参数的不同,调用不同的执行步骤,在这里就直接start 329 | switch r.action { 330 | case CT_ACT_CREATE: 331 | err = r.container.Start(process) 332 | case CT_ACT_RESTORE: 333 | err = r.container.Restore(process, r.criuOpts) 334 | case CT_ACT_RUN: 335 | err = r.container.Run(process) 336 | default: 337 | panic("Unknown action") 338 | } 339 | if err != nil { 340 | r.destroy() 341 | return -1, err 342 | } 343 | //以下都是完成一些start之后的后续工作 344 | if err := tty.waitConsole(); err != nil { 345 | r.terminate(process) 346 | r.destroy() 347 | return -1, err 348 | } 349 | if err = tty.ClosePostStart(); err != nil { 350 | r.terminate(process) 351 | r.destroy() 352 | return -1, err 353 | } 354 | if r.pidFile != "" { 355 | //为容器创建一个pid-file 356 | if err = createPidFile(r.pidFile, process); err != nil { 357 | r.terminate(process) 358 | r.destroy() 359 | return -1, err 360 | } 361 | } 362 | status, err := handler.forward(process, tty, detach) 363 | if err != nil { 364 | r.terminate(process) 365 | } 366 | if detach { 367 | return 0, nil 368 | } 369 | r.destroy() 370 | return status, err 371 | } 372 | 373 | func (c *linuxContainer) Start(process *Process) error { 374 | c.m.Lock() 375 | defer c.m.Unlock() 376 | if process.Init { 377 | // 创建一个路径为/run/runc/$ID/exec.fifo的管道文件 378 | if err := c.createExecFifo(); err != nil { 379 | return err 380 | } 381 | } 382 | // 真正启动容器进程,runc与容器进程之间的通信通过创建的init管道或者环境变量 383 | if err := c.start(process); err != nil { 384 | if process.Init { 385 | //失败了需要删除刚才创建的管道 386 | c.deleteExecFifo() 387 | } 388 | return err 389 | } 390 | return nil 391 | } 392 | 393 | 容器进程在产生后必须从runc读取配置才能够继续进行,path: opencontainers/runc/libcontainer/factory_linux.go 394 | 395 | func (l *LinuxFactory) StartInitialization() (err error) { 396 | var ( 397 | pipefd, fifofd int 398 | consoleSocket *os.File 399 | envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") 400 | envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") 401 | envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") 402 | ) 403 | 404 | // Get the INITPIPE. 405 | pipefd, err = strconv.Atoi(envInitPipe) 406 | if err != nil { 407 | return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) 408 | } 409 | 410 | var ( 411 | pipe = os.NewFile(uintptr(pipefd), "pipe") 412 | // 判断是`runc create`还是`runc exec` 413 | it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) 414 | ) 415 | defer pipe.Close() 416 | 417 | // Only init processes have FIFOFD. 418 | // 只有init进程有FIFOFD 419 | fifofd = -1 420 | if it == initStandard { 421 | if fifofd, err = strconv.Atoi(envFifoFd); err != nil { 422 | return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) 423 | } 424 | } 425 | ... 426 | // 会从管道中读取config,然后返回Init的接口对象 427 | i, err := newContainerInit(it, pipe, consoleSocket, fifofd) 428 | if err != nil { 429 | return err 430 | } 431 | // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. 432 | //下边的代码片段就是展示这个方法 433 | return i.Init() 434 | } 435 | 436 | 437 | path: opencontainers/runc/libcontainer/standard_init_linux.go 438 | func (l *linuxStandardInit) Init() error { 439 | ... 440 | // 配置network, 配置路由等等 441 | ... 
442 | // 准备rootfs 443 | if err := prepareRootfs(l.pipe, l.config); err != nil { 444 | return err 445 | } 446 | // 配置console, hostname, apparmor, process label, sysctl等等 447 | ... 448 | // 告诉父进程我们已经准备好Exec了 449 | if err := syncParentReady(l.pipe); err != nil { 450 | return err 451 | } 452 | // 配置seccomp 453 | ... 454 | // 设置正确的capability,用户以及工作目录 455 | if err := finalizeNamespace(l.config); err != nil { 456 | return err 457 | } 458 | ... 459 | // 确定用户指定的容器进程在容器文件系统中的路径 460 | name, err := exec.LookPath(l.config.Args[0]) 461 | if err != nil { 462 | return err 463 | } 464 | // 关闭init管道,告诉runC进程,我们已经完成了初始化工作 465 | l.pipe.Close() 466 | // 在exec用户进程之前等待exec.fifo管道在另一端被打开 467 | // 我们通过/proc/self/fd/$fd打开它 468 | fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) 469 | ... 470 | // 向exec.fifo管道写数据,阻塞,直到用户调用`runc start`,读取管道中的数据 471 | if _, err := unix.Write(fd, []byte("0")); err != nil { 472 | return newSystemErrorWithCause(err, "write 0 exec fifo") 473 | } 474 | ... 475 | // 调用exec命令,执行用户进程,也就是我们在config文件中看到的process描述的命令 476 | if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { 477 | return newSystemErrorWithCause(err, "exec user process") 478 | } 479 | return nil 480 | } 481 | 482 | path: opencontainers/runc/libcontainer/rootfs_linux.go 483 | 484 | func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { 485 | ... 486 | // 配置mounts, dev,将mounts挂载到rootfs等 487 | ... 488 | // 通知父进程运行pre-start hooks 489 | if err := syncParentHooks(pipe); err != nil { 490 | return err 491 | } 492 | ... 493 | if config.NoPivotRoot { 494 | err = msMoveRoot(config.Rootfs) 495 | } else if config.Namespaces.Contains(configs.NEWNS) { 496 | err = pivotRoot(config.Rootfs) 497 | } else { 498 | //最后还是通过chroot来切换文件系统的视角 499 | err = chroot(config.Rootfs) 500 | } 501 | ... 
502 | return nil 503 | 504 | > prepareRootfs先对容器的Mounts和Dev等信息进行配置,之后再调用syncParentHooks,通过init管道向runC进程发送procHooks信号。runC进程接收到procHooks信号之后,执行容器的PreStart 505 | > Hook回调函数,再通过init管道给容器初始化进程发送信号procResume,通知其继续执行。可见容器的PreStart 506 | > Hook是在根目录尚未切换之前执行完成的。最终,调用chroot函数,切换根目录。至此,容器的文件系统切换完毕。 507 | > 508 | > 在文件系统准备完成之后,Init方法还会对Console, 509 | > hostname等属性进行配置。当一切就绪之后,调用syncParentReady通过init管道通知runC进程,获取响应之后,关闭init管道,同步结束,准备开始执行用户指定的容器进程。 510 | > 511 | > 不过在找到了用户指定的容器程序在容器文件系统的执行路径之后,初始化进程又打开了我们之前多次提到的exec.fifo这个管道,并且往里面写入了一个字节,之后才执行Exec系统调用,切换到用户程序。既然exec.fifo是一个管道,那么我们在这一端写入之后,就必须有消费者在另外一端进行读取,否则写进程就会一直处于阻塞状态。 512 | > 513 | > 事实上,此处对exec.fifo管道的写阻塞正是runc create和runc 514 | > start执行流的分界点。容器的创建工作,在容器初始化进程往exec.fifo管道进行写操作的那一刻,就全部结束了。 515 | 516 | ### 容器启动 517 | 518 | path: opencontainers/runc/start.go 519 | 520 | Action: func(context *cli.Context) error { 521 | if err := checkArgs(context, 1, exactArgs); err != nil { 522 | return err 523 | } 524 | container, err := getContainer(context) 525 | if err != nil { 526 | return err 527 | } 528 | status, err := container.Status() 529 | if err != nil { 530 | return err 531 | } 532 | switch status { 533 | case libcontainer.Created: 534 | // runc start的执行路径到这 535 | return container.Exec() 536 | case libcontainer.Stopped: 537 | return errors.New("cannot start a container that has stopped") 538 | case libcontainer.Running: 539 | return errors.New("cannot start an already running container") 540 | default: 541 | return fmt.Errorf("cannot start a container in the %s state\n", status) 542 | } 543 | } 544 | 545 | path: opencontainers/runc/libcontainer/container_linux.go 546 | 547 | func (c *linuxContainer) exec() error { 548 | path := filepath.Join(c.root, execFifoFilename) 549 | 550 | fifoOpen := make(chan struct{}) 551 | select { 552 | case <-awaitProcessExit(c.initProcess.pid(), fifoOpen): 553 | return errors.New("container process is already dead") 554 | //打开fifo,以解开刚才创建容器过程中exec-fifo的写堵塞 555 | case result := <-awaitFifoOpen(path): 556 | close(fifoOpen) 557 | if result.err != nil { 558 | return result.err 559 | } 560 | f := result.file 561 | defer f.Close() 562 | //读取exec-fifo中的内容,也就是刚才写入的那个字节 563 | if err := readFromExecFifo(f); err != nil { 564 | return err 565 | } 566 | return os.Remove(path) 567 | } 568 | } 569 | 570 | > 可是这一路分析下来,似乎并没有对容器的namespace进行配置的操作?事实上,子进程runc 571 | > init的执行流在进入Go语言的运行时之前,会被包/runc/libcontainer/nsenter劫持,先去执行一段C代码。这段C代码同样会从init管道中读取容器的配置,主要是namespace的路径,clone 572 | > flag等等,并根据这些配置,调用setns系统调用,将容器进程加入到合适的namespace中。之后再进入Go的运行时,完成上文所述的各种初始化操作。 573 | 574 | ### 总结 575 | 摘一张来自[zju blog][1]的图片 576 | ![image.png-68.8kB][2] 577 | 578 | ### 参考 579 | 1. http://www.sel.zju.edu.cn/?p=840 580 | 2. https://cizixs.com/2017/11/05/oci-and-runc/ 581 | 3. https://blog.csdn.net/zhonglinzhang/article/details/76757277 582 | 4. 
https://segmentfault.com/a/1190000016366810 583 | 584 | [1]: http://www.sel.zju.edu.cn/?p=840 585 | [2]: http://static.zybuluo.com/myecho/5ey61f8u33wpawhc9i30wyu2/image.png 586 | -------------------------------------------------------------------------------- /chapter4/README.md: -------------------------------------------------------------------------------- 1 | # 镜像 2 | 3 | 介绍containerd如何完成镜像的拉取以及使用过程,注意containerd 没有build image的功能。 4 | -------------------------------------------------------------------------------- /chapter4/section4.1.md: -------------------------------------------------------------------------------- 1 | # image fetch 2 | 3 | 如下图所展示pull images的流程,总体来说containerd的pull流程可以分为fetch和unpack两部分。本节主要关注fetch步骤。 4 | 5 | ![image.png-351.2kB][1] 6 | 7 | ### oci distribution spec 8 | 目前containerd当前同时支持docker版的和oci版的registry api。我们首先来看一下oci distribution spec定义了哪些内容。 9 | 10 | path: https://github.com/opencontainers/distribution-spec/blob/master/spec.md 11 | 12 | An "image" is a combination of a JSON manifest and individual layer files. The process of pulling an image centers around retrieving these two components. 13 | 14 | 想要pull image首先要从registry中获取manifests,包含以下的fields. 15 | 16 | | field | description | 17 | |-----------|------------------------------------------------| 18 | | name | The name of the image. | 19 | | tag | The tag for this version of the image. | 20 | | fsLayers | A list of layer descriptors (including digest) | 21 | | signature | A JWS used to verify the manifest content | 22 | 23 | 更详细的manifest的example可以在`https://github.com/ZYecho/image-spec/blob/master/manifest.md`中找到。 24 | 对应的API为`GET /v2//manifests/`,the reference may include a tag or digest. 25 | 26 | 然后根据manifests中包含的digest去获取每层layer的blob,通过API `GET /v2//blobs/`进行获取。 27 | 28 | 值得注意在整个拉取过程中没有出现image id的用处,image id是docker的用法,image id和manifest中config的digest相同,本质上是image configuration JSON的digest。 29 | 30 | Docker-Content-Digest header: 31 | 这个头部后边还需要用到,先放到这里了解一下。 32 | > To provide verification of http content, any response may include a 33 | > Docker-Content-Digest header. This will include the digest of the 34 | > target entity returned in the response. For blobs, this is the entire 35 | > blob content. For manifests, this is the manifest body without the 36 | > signature content, also known as the JWS payload. Note that the 37 | > commonly used canonicalization for digest calculation may be dependent 38 | > on the mediatype of the content, such as with manifests. 39 | 40 | ### fetch流程 41 | path: containerd/cmd/ctr/commands/images/pull.go 42 | ctr image pull操作流程如下: 43 | 44 | 1. resolve用户需要下载的镜像 45 | 2. 从registry pull镜像,把镜像层内容和config保存进content服务,把镜像相关的元数据保存进images元数据服务 46 | 3. 
unpack进snapshot服务 47 | 48 | //ref就是拉取用的reference, 形如docker.io/library/redis:alpine 49 | img, err := content.Fetch(ctx, client, ref, config) 50 | if err != nil { 51 | return err 52 | } 53 | 54 | 这里直接调用content实现的Fetch函数,是和`ctr content fetch`用的同样的处理逻辑。 55 | 56 | // Fetch loads all resources into the content store and returns the image 57 | func Fetch(ctx context.Context, client *containerd.Client, ref string, config *FetchConfig) (images.Image, error) { 58 | ongoing := newJobs(ref) 59 | 60 | pctx, stopProgress := context.WithCancel(ctx) 61 | progress := make(chan struct{}) 62 | 63 | go func() { 64 | if config.ProgressOutput != nil { 65 | // no progress bar, because it hides some debug logs 66 | showProgress(pctx, ongoing, client.ContentStore(), config.ProgressOutput) 67 | } 68 | close(progress) 69 | }() 70 | 71 | h := images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 72 | // 将非manifest的digest添加到Ongoing中,在showProgress中会使用到 73 | if desc.MediaType != images.MediaTypeDockerSchema1Manifest { 74 | ongoing.add(desc) 75 | } 76 | return nil, nil 77 | }) 78 | 79 | log.G(pctx).WithField("image", ref).Debug("fetching") 80 | labels := commands.LabelArgs(config.Labels) 81 | opts := []containerd.RemoteOpt{ 82 | containerd.WithPullLabels(labels), 83 | containerd.WithResolver(config.Resolver), 84 | containerd.WithImageHandler(h), 85 | containerd.WithSchema1Conversion, 86 | } 87 | for _, platform := range config.Platforms { 88 | opts = append(opts, containerd.WithPlatform(platform)) 89 | } 90 | //调用containerd的grpc服务 91 | img, err := client.Fetch(pctx, ref, opts...) 92 | stopProgress() 93 | if err != nil { 94 | return images.Image{}, err 95 | } 96 | 97 | <-progress 98 | return img, nil 99 | } 100 | 101 | path: containerd/containerd/client.go 102 | 103 | func (c *Client) fetch(ctx context.Context, rCtx *RemoteContext, ref string, limit int) (images.Image, error) { 104 | store := c.ContentStore() 105 | //第一步首先将ref解析为descriptor,descriptor可以理解为一个可以用来下载的描述对象 106 | name, desc, err := rCtx.Resolver.Resolve(ctx, ref) 107 | if err != nil { 108 | return images.Image{}, errors.Wrapf(err, "failed to resolve reference %q", ref) 109 | } 110 | 111 | fetcher, err := rCtx.Resolver.Fetcher(ctx, name) 112 | if err != nil { 113 | return images.Image{}, errors.Wrapf(err, "failed to get fetcher for %q", name) 114 | } 115 | 116 | var ( 117 | schema1Converter *schema1.Converter 118 | handler images.Handler 119 | ) 120 | if desc.MediaType == images.MediaTypeDockerSchema1Manifest && rCtx.ConvertSchema1 { 121 | // 兼容逻辑 122 | schema1Converter = schema1.NewConverter(store, fetcher) 123 | handler = images.Handlers(append(rCtx.BaseHandlers, schema1Converter)...) 124 | } else { 125 | // Get all the children for a descriptor 126 | childrenHandler := images.ChildrenHandler(store) 127 | // Set any children labels for that content 128 | childrenHandler = images.SetChildrenLabels(store, childrenHandler) 129 | // Filter children by platforms 130 | childrenHandler = images.FilterPlatforms(childrenHandler, rCtx.PlatformMatcher) 131 | // Sort and limit manifests if a finite number is needed 132 | if limit > 0 { 133 | childrenHandler = images.LimitManifests(childrenHandler, rCtx.PlatformMatcher, limit) 134 | } 135 | // 会在后边分别介绍这几个Handler, Handlers返回一个新的Handler(具备顺序遍历的功能) 136 | handler = images.Handlers(append(rCtx.BaseHandlers, 137 | remotes.FetchHandler(store, fetcher), 138 | childrenHandler, 139 | )...) 
140 | } 141 | 142 | //这个真正开始下载的流程,包括manifest以及镜像层 143 | if err := images.Dispatch(ctx, handler, desc); err != nil { 144 | return images.Image{}, err 145 | } 146 | if schema1Converter != nil { 147 | desc, err = schema1Converter.Convert(ctx) 148 | if err != nil { 149 | return images.Image{}, err 150 | } 151 | } 152 | 153 | img := images.Image{ 154 | Name: name, 155 | Target: desc, 156 | Labels: rCtx.Labels, 157 | } 158 | 159 | is := c.ImageService() 160 | for { 161 | //调用images服务存储镜像元数据 162 | if created, err := is.Create(ctx, img); err != nil { 163 | if !errdefs.IsAlreadyExists(err) { 164 | return images.Image{}, err 165 | } 166 | 167 | updated, err := is.Update(ctx, img) 168 | if err != nil { 169 | // if image was removed, try create again 170 | if errdefs.IsNotFound(err) { 171 | continue 172 | } 173 | return images.Image{}, err 174 | } 175 | 176 | img = updated 177 | } else { 178 | img = created 179 | } 180 | 181 | return img, nil 182 | } 183 | } 184 | 185 | ### 构造descriptor对象 186 | 下边看一下第一步ref是如何转到为descriptor的。 187 | 188 | func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocispec.Descriptor, error) { 189 | //首先将ref转化为locator(docker.io/library/redis)和object(tag -> 2.7.8或者是digest)两部分 190 | refspec, err := reference.Parse(ref) 191 | if err != nil { 192 | return "", ocispec.Descriptor{}, err 193 | } 194 | 195 | if refspec.Object == "" { 196 | return "", ocispec.Descriptor{}, reference.ErrObjectRequired 197 | } 198 | 199 | //在其中提取出了镜像repo的地址,为了后期方便组装镜像的地址 200 | base, err := r.base(refspec) 201 | if err != nil { 202 | return "", ocispec.Descriptor{}, err 203 | } 204 | 205 | fetcher := dockerFetcher{ 206 | dockerBase: base, 207 | } 208 | 209 | var ( 210 | urls []string 211 | dgst = refspec.Digest() 212 | ) 213 | 214 | //如果是Object是用digest来表示的 215 | if dgst != "" { 216 | if err := dgst.Validate(); err != nil { 217 | // need to fail here, since we can't actually resolve the invalid 218 | // digest. 219 | return "", ocispec.Descriptor{}, err 220 | } 221 | 222 | //首先尝试registry-1.docker.io/v2/library/ubuntu/manifest/sha256:xxx 223 | // turns out, we have a valid digest, make a url. 224 | urls = append(urls, fetcher.url("manifests", dgst.String())) 225 | 226 | // fallback to blobs on not found. 227 | //如果失败再使用registry-1.docker.io/v2/library/ubuntu/blobs/sha256:xxx 228 | urls = append(urls, fetcher.url("blobs", dgst.String())) 229 | } else { 230 | //直接使用tag来进行访问,registry-1.docker.io/v2/library/redis/manifest/alpine 231 | urls = append(urls, fetcher.url("manifests", refspec.Object)) 232 | } 233 | 234 | ctx, err = contextWithRepositoryScope(ctx, refspec, false) 235 | if err != nil { 236 | return "", ocispec.Descriptor{}, err 237 | } 238 | for _, u := range urls { 239 | //注意这里是HEAD,也就是对应着spec中的Existing Manifests API,先看看Manifests在不在,在的话再去下载之 240 | req, err := http.NewRequest(http.MethodHead, u, nil) 241 | if err != nil { 242 | return "", ocispec.Descriptor{}, err 243 | } 244 | 245 | // set headers for all the types we support for resolution. 
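    // 补充:Accept 里同时声明了 docker schema2 manifest、manifest list 以及 OCI manifest/index
    // 等 media type,registry 会按内容协商返回其中一种;由于这里发的是 HEAD 请求,
    // 构造 descriptor 所需的信息(digest、media type、size)全部来自响应头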
246 | //构建HTTP请求对象头部 247 | req.Header.Set("Accept", strings.Join([]string{ 248 | images.MediaTypeDockerSchema2Manifest, 249 | images.MediaTypeDockerSchema2ManifestList, 250 | ocispec.MediaTypeImageManifest, 251 | ocispec.MediaTypeImageIndex, "*"}, ", ")) 252 | 253 | log.G(ctx).Debug("resolving") 254 | resp, err := fetcher.doRequestWithRetries(ctx, req, nil) 255 | if err != nil { 256 | if errors.Cause(err) == ErrInvalidAuthorization { 257 | err = errors.Wrapf(err, "pull access denied, repository does not exist or may require authorization") 258 | } 259 | return "", ocispec.Descriptor{}, err 260 | } 261 | //在构建descriptor过程中没有使用到resp body的内容 262 | resp.Body.Close() // don't care about body contents. 263 | 264 | if resp.StatusCode > 299 { 265 | if resp.StatusCode == http.StatusNotFound { 266 | continue 267 | } 268 | return "", ocispec.Descriptor{}, errors.Errorf("unexpected status code %v: %v", u, resp.Status) 269 | } 270 | 271 | // this is the only point at which we trust the registry. we use the 272 | // content headers to assemble a descriptor for the name. when this becomes 273 | // more robust, we mostly get this information from a secure trust store. 274 | //关于这个头部,在前边介绍OCI distribution spec的时候有提到,目的还是为了校验 275 | dgstHeader := digest.Digest(resp.Header.Get("Docker-Content-Digest")) 276 | 277 | if dgstHeader != "" { 278 | if err := dgstHeader.Validate(); err != nil { 279 | return "", ocispec.Descriptor{}, errors.Wrapf(err, "%q in header not a valid digest", dgstHeader) 280 | } 281 | dgst = dgstHeader 282 | } 283 | 284 | if dgst == "" { 285 | return "", ocispec.Descriptor{}, errors.Errorf("could not resolve digest for %v", ref) 286 | } 287 | 288 | var ( 289 | size int64 290 | sizeHeader = resp.Header.Get("Content-Length") 291 | ) 292 | 293 | size, err = strconv.ParseInt(sizeHeader, 10, 64) 294 | if err != nil { 295 | 296 | return "", ocispec.Descriptor{}, errors.Wrapf(err, "invalid size header: %q", sizeHeader) 297 | } 298 | if size < 0 { 299 | return "", ocispec.Descriptor{}, errors.Errorf("%q in header not a valid size", sizeHeader) 300 | } 301 | 302 | desc := ocispec.Descriptor{ 303 | //可以看到这三个值都是从头部中拿到的 304 | Digest: dgst, 305 | //可能为application/vnd.docker.distribution.manifest.list.v2+json, 对应着不同平台的多个image, containerd在向registry下载manifest list之后,再去选择下载其中的某个平台的镜像。 306 | MediaType: resp.Header.Get("Content-Type"), // need to strip disposition? 307 | Size: size, 308 | } 309 | 310 | log.G(ctx).WithField("desc.digest", desc.Digest).Debug("resolved") 311 | return ref, desc, nil 312 | } 313 | 314 | return "", ocispec.Descriptor{}, errors.Errorf("%v not found", ref) 315 | } 316 | 317 | //执行下载的主要框架,具体的执行逻辑在handler中,在下一节进行介绍 318 | func Dispatch(ctx context.Context, handler Handler, descs ...ocispec.Descriptor) error { 319 | eg, ctx := errgroup.WithContext(ctx) 320 | for _, desc := range descs { 321 | desc := desc 322 | //一个协程池,对于不同的descriptor来说下载是并发的 323 | eg.Go(func() error { 324 | desc := desc 325 | 326 | children, err := handler.Handle(ctx, desc) 327 | if err != nil { 328 | if errors.Cause(err) == ErrSkipDesc { 329 | return nil // don't traverse the children. 330 | } 331 | return err 332 | } 333 | 334 | if len(children) > 0 { 335 | // 本质上是个dfs过程 336 | return Dispatch(ctx, handler, children...) 
337 | } 338 | 339 | return nil 340 | }) 341 | } 342 | 343 | return eg.Wait() 344 | } 345 | 346 | #### Handlers介绍 347 | 348 | 下载的主要是任务都是通过Handler来完成的,看一下其最原始的定义。 349 | path: /home/zhangyue/go/src/github.com/containerd/containerd/images/handlers.go, 也关注一下其中的一些utils函数 350 | 351 | // HandlerFunc function implementing the Handler interface 352 | type HandlerFunc func(ctx context.Context, desc ocispec.Descriptor) (subdescs []ocispec.Descriptor, err error) 353 | 354 | BaseHandler: 355 | 通过connext注册过来的,只是负责将当前的desc进行登记, 356 | 357 | h := images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 358 | if desc.MediaType != images.MediaTypeDockerSchema1Manifest { 359 | ongoing.add(desc) 360 | } 361 | return nil, nil 362 | }) 363 | 364 | FetchHandler: containerd/containerd/remotes/handlers.go 365 | 366 | func fetch(ctx context.Context, ingester content.Ingester, fetcher Fetcher, desc ocispec.Descriptor) error { 367 | log.G(ctx).Debug("fetch") 368 | 369 | cw, err := content.OpenWriter(ctx, ingester, content.WithRef(MakeRefKey(ctx, desc)), content.WithDescriptor(desc)) 370 | if err != nil { 371 | if errdefs.IsAlreadyExists(err) { 372 | return nil 373 | } 374 | return err 375 | } 376 | defer cw.Close() 377 | 378 | ws, err := cw.Status() 379 | if err != nil { 380 | return err 381 | } 382 | 383 | if ws.Offset == desc.Size { 384 | // If writer is already complete, commit and return 385 | err := cw.Commit(ctx, desc.Size, desc.Digest) 386 | if err != nil && !errdefs.IsAlreadyExists(err) { 387 | return errors.Wrapf(err, "failed commit on ref %q", ws.Ref) 388 | } 389 | return nil 390 | } 391 | 392 | //通过http get得到Reader 393 | rc, err := fetcher.Fetch(ctx, desc) 394 | if err != nil { 395 | return err 396 | } 397 | defer rc.Close() 398 | 399 | //将得到的rc写入到content中去 400 | return content.Copy(ctx, cw, rc, desc.Size, desc.Digest) 401 | } 402 | 403 | path: containerd/containerd/remotes/docker/fetcher.go 404 | 405 | func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error) { 406 | ctx = log.WithLogger(ctx, log.G(ctx).WithFields( 407 | logrus.Fields{ 408 | "base": r.base.String(), 409 | "digest": desc.Digest, 410 | }, 411 | )) 412 | // 如果是manifest的话首先是*/manifests/* 413 | // 其次是走*/blobs/*即可 414 | // 如果不是manifest的话直接走*/blobs/*即可 415 | urls, err := r.getV2URLPaths(ctx, desc) 416 | if err != nil { 417 | return nil, err 418 | } 419 | 420 | ctx, err = contextWithRepositoryScope(ctx, r.refspec, false) 421 | if err != nil { 422 | return nil, err 423 | } 424 | 425 | return newHTTPReadSeeker(desc.Size, func(offset int64) (io.ReadCloser, error) { 426 | for _, u := range urls { 427 | rc, err := r.open(ctx, u, desc.MediaType, offset) 428 | if err != nil { 429 | if errdefs.IsNotFound(err) { 430 | continue // try one of the other urls. 431 | } 432 | 433 | return nil, err 434 | } 435 | 436 | return rc, nil 437 | } 438 | 439 | return nil, errors.Wrapf(errdefs.ErrNotFound, 440 | "could not fetch content descriptor %v (%v) from remote", 441 | desc.Digest, desc.MediaType) 442 | 443 | }) 444 | } 445 | 446 | childrenHandler: 447 | 448 | // Get all the children for a descriptor 449 | // 这是个解析manifest的handler,返回[]Descriptor 450 | childrenHandler := images.ChildrenHandler(store) 451 | // Set any children labels for that content 452 | childrenHandler = images.SetChildrenLabels(store, childrenHandler) 453 | // Filter children by platforms 454 | childrenHandler = images.FilterPlatforms(childrenHandler, pullCtx.Platforms...) 
455 | 456 | 中间的处理handler, path: containerd/containerd/images/handlers.go 457 | 458 | func SetChildrenLabels(manager content.Manager, f HandlerFunc) HandlerFunc { 459 | return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 460 | children, err := f(ctx, desc) 461 | if err != nil { 462 | return children, err 463 | } 464 | 465 | if len(children) > 0 {//如果在上一步处理结束后发现包含子descriptor 466 | info := content.Info{ 467 | Digest: desc.Digest, 468 | Labels: map[string]string{}, 469 | } 470 | fields := []string{} 471 | for i, ch := range children { 472 | info.Labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i)] = ch.Digest.String() 473 | fields = append(fields, fmt.Sprintf("labels.containerd.io/gc.ref.content.%d", i)) 474 | } 475 | //将子descriptor作为descriptor的label 476 | _, err := manager.Update(ctx, info, fields...) 477 | if err != nil { 478 | return nil, err 479 | } 480 | } 481 | 482 | return children, err 483 | } 484 | } 485 | 486 | //主要作用是在manifest list上添加label来表示包含关系 487 | func SetChildrenLabels(manager content.Manager, f HandlerFunc) HandlerFunc { 488 | return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 489 | children, err := f(ctx, desc) 490 | if err != nil { 491 | return children, err 492 | } 493 | 494 | if len(children) > 0 {//如果在上一步处理结束后发现包含子descriptor 495 | info := content.Info{ 496 | Digest: desc.Digest, 497 | Labels: map[string]string{}, 498 | } 499 | fields := []string{} 500 | for i, ch := range children { 501 | info.Labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i)] = ch.Digest.String() 502 | fields = append(fields, fmt.Sprintf("labels.containerd.io/gc.ref.content.%d", i)) 503 | } 504 | //将子descriptor作为descriptor的label 505 | _, err := manager.Update(ctx, info, fields...) 506 | if err != nil { 507 | return nil, err 508 | } 509 | } 510 | 511 | return children, err 512 | } 513 | } 514 | 515 | 要注意这些handler在使用过程中是按照顺序来调用,因此首先是baseHandler将任务添加到onGoning任务列表,然后是fetchHandler完成descreiptor的下载任务(落入content对象),最后是childrenHandler的3个子handler完成任务(完成解析,打标签,过滤的操作); 同时要注意dispatch函数是一个深度优先遍历的过程,同时其内部如果有多个descriptor的话,那么本身是一个并行的过程,也就是说不同layer之间实际上是并行下载的。 516 | 517 | ### 参考 518 | http://www.sel.zju.edu.cn/?p=921 519 | 520 | [1]: http://static.zybuluo.com/myecho/5ge7ybzdsb69920sxvkd958k/image.png 521 | -------------------------------------------------------------------------------- /chapter4/section4.2.md: -------------------------------------------------------------------------------- 1 | # image unpack 2 | 3 | ![pull image][1] 4 | 前边介绍过fetch把镜像层内容和config保存进content服务,把镜像相关的元数据保存进images元数据服务中,而unpack过程中如下所述 5 | 6 | > Once the image is pulled, the user can instruct the bundle controller 7 | > to unpack the image into a bundle. Consuming from the content store, 8 | > layers from the image are unpacked into the snapshot component. 
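在进入细节之前,先给出一个用 containerd client 完成 pull 并随即 unpack 的最小示例(socket 路径、namespace、镜像名均为示意),后文分析的 Unpack 流程就是由类似的调用触发的:

    package main

    import (
        "context"
        "log"

        "github.com/containerd/containerd"
        "github.com/containerd/containerd/namespaces"
    )

    func main() {
        // 连接本机 containerd(此处为默认 socket 路径)
        client, err := containerd.New("/run/containerd/containerd.sock")
        if err != nil {
            log.Fatal(err)
        }
        defer client.Close()

        ctx := namespaces.WithNamespace(context.Background(), "default")
        // WithPullUnpack 表示 pull 完成后随即 unpack 到默认的 snapshotter
        img, err := client.Pull(ctx, "docker.io/library/redis:alpine", containerd.WithPullUnpack)
        if err != nil {
            log.Fatal(err)
        }
        log.Println("unpacked image:", img.Name())
    }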
9 | 10 | ### OCI image spec 11 | ![镜像之间关系图][2] 12 | Image Index: 可以理解为Manifest list, 是镜像在不同平台的Manifest的集合, path: https://github.com/ZYecho/image-spec/blob/master/image-index.md 13 | 14 | Manifest是一个镜像描述文件,包含config和layers两个主体,path:https://github.com/ZYecho/image-spec/blob/master/manifest.md 15 | 16 | layers有很多层,其digest对应压缩后的格式比如gzip,tar的sha256值。 17 | 18 | 这里要区分下Manifest和镜像的描述文件的区别,下边给出两个例子 19 | 20 | // Manifest文件,其digest在Image Index中 21 | 22 | { 23 | "schemaVersion": 2, 24 | "mediaType": "application/vnd.docker.distribution.manifest.v2+json", 25 | "config": { 26 | "mediaType": "application/vnd.docker.container.image.v1+json", 27 | "size": 5108, 28 | //对应config文件的digest,需要再去下载 29 | "digest": "sha256:80581db8c700155a91bec6fd6398dad9733135e7c58a19472aa679e8367692ab" 30 | }, 31 | "layers": [ 32 | { 33 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 34 | "size": 2206542, 35 | "digest": "sha256:8e3ba11ec2a2b39ab372c60c16b421536e50e5ce64a0bc81765c2e38381bcff6" 36 | }, 37 | { 38 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 39 | "size": 1249, 40 | "digest": "sha256:1f20bd2a5c234ffab42de6cbf83522946614b21b642a8208dca6b0fd614c31db" 41 | }, 42 | { 43 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 44 | "size": 9071, 45 | "digest": "sha256:782ff7702b5cd0a7c0109740838c74945fc27e4ce34e1028c24bf73f8249a63a" 46 | }, 47 | { 48 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 49 | "size": 9407568, 50 | "digest": "sha256:cd719ead7ee305492514a8dfa2afcd0979a16e8192836b4aaed98d8d932973c0" 51 | }, 52 | { 53 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 54 | "size": 98, 55 | "digest": "sha256:01018940af9a67873ad6737c275cb134372cdf1cda565af58dd14a1e3b85ab2a" 56 | }, 57 | { 58 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 59 | "size": 398, 60 | "digest": "sha256:3f1bfdda9588f5c0643b485580060b460b21b5331f4760778ef3279680e20966" 61 | } 62 | ] 63 | } 64 | 65 | 这是config文件,path: https://github.com/ZYecho/image-spec/blob/master/config.md 66 | 67 | { 68 | "architecture":"amd64", 69 | "config":{ 70 | "Hostname":"", 71 | "Domainname":"", 72 | "User":"", 73 | "AttachStdin":false, 74 | "AttachStdout":false, 75 | "AttachStderr":false, 76 | "ExposedPorts":{ 77 | "6379/tcp":{ 78 | 79 | } 80 | }, 81 | ... 
82 | "history":[ 83 | { 84 | "created":"2018-07-06T14:14:06.165546783Z", 85 | "created_by":"/bin/sh -c #(nop) ADD file:25f61d70254b9807a40cd3e8d820f6a5ec0e1e596de04e325f6a33810393e95a in / " 86 | }, 87 | { 88 | "created":"2018-07-11T00:55:43.769226605Z", 89 | "created_by":"/bin/sh -c apk add --no-cache 'su-exec\u003e=0.2'" 90 | }, 91 | { 92 | "created":"2018-07-11T00:57:21.027940339Z", 93 | "created_by":"/bin/sh -c #(nop) VOLUME [/data]", 94 | "empty_layer":true 95 | }, 96 | { 97 | "created":"2018-07-11T00:57:21.312115762Z", 98 | "created_by":"/bin/sh -c #(nop) WORKDIR /data", 99 | "empty_layer":true 100 | } 101 | ], 102 | "os":"linux", 103 | //这个很重要 104 | "rootfs":{ 105 | "type":"layers", 106 | "diff_ids":[ 107 | "sha256:73046094a9b835e443af1a9d736fcfc11a994107500e474d0abf399499ed280c", 108 | "sha256:9f8f870604a08909589f09337944210db2bf72b2a71f0f707642b3aa9d225f9b", 109 | "sha256:221a0f51690d5f9063c9a113aa5e1340b50ac4474e7525efb9c60b945589110f", 110 | "sha256:1b44c45cbb1c6711042de1c2e8785b554da0d25ed03bbd2a6a13fb498eceb6ae", 111 | "sha256:211a9f3eb69c634f0368994600e7c93df7510f87870da57e51460f177c603ca9", 112 | "sha256:151bcb0152a97b3a445bc2e2ed29432fc77d662fcb746b05b87d77a8d7bbf023" 113 | ] 114 | } 115 | } 116 | 117 | 注意区别一下diffid和manifest layer digest的区别: 118 | 119 | > A layer DiffID is the digest over the layer's uncompressed tar archive 120 | > and serialized in the descriptor digest format 121 | > 122 | > NOTE: Do not confuse DiffIDs with layer digests, often referenced in 123 | > the manifest, which are digests over compressed or uncompressed 124 | > content. 125 | 126 | 127 | Image layout: 128 | https://github.com/ZYecho/image-spec/blob/master/image-layout.md 129 | 这个是说镜像体现在文件系统上的layout,当然可能是存在于tar包或者nfs等。 130 | 131 | ### 流程解析 132 | //snapshotterName default为overlay 133 | func (i *image) Unpack(ctx context.Context, snapshotterName string) error { 134 | // lease的意义?应该是和GC相关 135 | ctx, done, err := i.client.WithLease(ctx) 136 | if err != nil { 137 | return err 138 | } 139 | defer done(ctx) 140 | 141 | layers, err := i.getLayers(ctx, i.platform) 142 | if err != nil { 143 | return err 144 | } 145 | 146 | var ( 147 | sn = i.client.SnapshotService(snapshotterName) 148 | a = i.client.DiffService() 149 | cs = i.client.ContentStore() 150 | 151 | chain []digest.Digest 152 | unpacked bool 153 | ) 154 | for _, layer := range layers { 155 | unpacked, err = rootfs.ApplyLayer(ctx, layer, chain, sn, a) 156 | if err != nil { 157 | return err 158 | } 159 | //unpack成功了 160 | if unpacked { 161 | // Set the uncompressed label after the uncompressed 162 | // digest has been verified through apply. 
163 | // 看起来这个意思是会把compressed的变成uncompressed的?有待确认 164 | cinfo := content.Info{ 165 | Digest: layer.Blob.Digest, 166 | Labels: map[string]string{ 167 | "containerd.io/uncompressed": layer.Diff.Digest.String(), 168 | }, 169 | } 170 | if _, err := cs.Update(ctx, cinfo, "labels.containerd.io/uncompressed"); err != nil { 171 | return err 172 | } 173 | } 174 | 175 | chain = append(chain, layer.Diff.Digest) 176 | } 177 | //最后一层也unpack成功了 178 | if unpacked { 179 | desc, err := i.i.Config(ctx, cs, i.platform) 180 | if err != nil { 181 | return err 182 | } 183 | 184 | rootfs := identity.ChainID(chain).String() 185 | 186 | cinfo := content.Info{ 187 | Digest: desc.Digest, 188 | Labels: map[string]string{ 189 | fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootfs, 190 | }, 191 | } 192 | if _, err := cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName)); err != nil { 193 | return err 194 | } 195 | } 196 | 197 | return nil 198 | } 199 | 200 | 201 | func (i *image) getLayers(ctx context.Context, platform platforms.MatchComparer) ([]rootfs.Layer, error) { 202 | cs := i.client.ContentStore() 203 | //从content中根据manifest的desc读出符合条件的Manifest对象 204 | manifest, err := images.Manifest(ctx, cs, i.i.Target, platform) 205 | if err != nil { 206 | return nil, err 207 | } 208 | //拿到image config里的diffids 209 | diffIDs, err := i.i.RootFS(ctx, cs, platform) 210 | if err != nil { 211 | return nil, errors.Wrap(err, "failed to resolve rootfs") 212 | } 213 | // 查看镜像层数是否相同 214 | if len(diffIDs) != len(manifest.Layers) { 215 | return nil, errors.Errorf("mismatched image rootfs and manifest layers") 216 | } 217 | layers := make([]rootfs.Layer, len(diffIDs)) 218 | for i := range diffIDs { 219 | //这个是image Config的desc, -> tar 220 | layers[i].Diff = ocispec.Descriptor{ 221 | // TODO: derive media type from compressed type 222 | MediaType: ocispec.MediaTypeImageLayer, 223 | Digest: diffIDs[i], 224 | } 225 | //这个是Manifest里边的desec, -> tar+gzip 226 | layers[i].Blob = manifest.Layers[i] 227 | } 228 | return layers, nil 229 | } 230 | 231 | path: containerd/containerd/rootfs/apply.go 232 | 233 | func applyLayers(ctx context.Context, layers []Layer, chain []digest.Digest, sn snapshots.Snapshotter, a diff.Applier, opts ...snapshots.Opt) error { 234 | var ( 235 | parent = identity.ChainID(chain[:len(chain)-1]) 236 | chainID = identity.ChainID(chain) 237 | layer = layers[len(layers)-1] 238 | diff ocispec.Descriptor 239 | key string 240 | mounts []mount.Mount 241 | err error 242 | ) 243 | 244 | for { 245 | // 注意这个Key并不是完全等价于chainID 246 | key = fmt.Sprintf("extract-%s %s", uniquePart(), chainID) 247 | 248 | // Prepare snapshot with from parent, label as root 249 | // step1 获取到经过COW之后的可挂载的mounts(type可能为bind brtfs等,目录和moby存储目录类似) 250 | mounts, err = sn.Prepare(ctx, key, parent.String(), opts...) 
251 | if err != nil { 252 | if errdefs.IsNotFound(err) && len(layers) > 1 { 253 | if err := applyLayers(ctx, layers[:len(layers)-1], chain[:len(chain)-1], sn, a); err != nil { 254 | if !errdefs.IsAlreadyExists(err) { 255 | return err 256 | } 257 | } 258 | // Do no try applying layers again 259 | layers = nil 260 | continue 261 | } else if errdefs.IsAlreadyExists(err) { 262 | // Try a different key 263 | continue 264 | } 265 | 266 | // Already exists should have the caller retry 267 | return errors.Wrapf(err, "failed to prepare extraction snapshot %q", key) 268 | 269 | } 270 | break 271 | } 272 | defer func() { 273 | if err != nil { 274 | if !errdefs.IsAlreadyExists(err) { 275 | log.G(ctx).WithError(err).WithField("key", key).Infof("apply failure, attempting cleanup") 276 | } 277 | 278 | if rerr := sn.Remove(ctx, key); rerr != nil { 279 | log.G(ctx).WithError(rerr).WithField("key", key).Warnf("extraction snapshot removal failed") 280 | } 281 | } 282 | }() 283 | 284 | // step2, Blob依然是tar+gzip的形式 285 | // 先将mounts挂载到一个temp dir上并且applyDiff,要明白DIffs在mount的path原有挂载点目录里也是可见的,和temp dir在哪没有关系 286 | diff, err = a.Apply(ctx, layer.Blob, mounts) 287 | if err != nil { 288 | err = errors.Wrapf(err, "failed to extract layer %s", layer.Diff.Digest) 289 | return err 290 | } 291 | // 判断一下是否符合预期 292 | if diff.Digest != layer.Diff.Digest { 293 | err = errors.Errorf("wrong diff id calculated on extraction %q", diff.Digest) 294 | return err 295 | } 296 | //step3 297 | if err = sn.Commit(ctx, chainID.String(), key, opts...); err != nil { 298 | err = errors.Wrapf(err, "failed to commit snapshot %s", key) 299 | return err 300 | } 301 | 302 | return nil 303 | } 304 | 305 | // 有兴趣可以跟一下apply的细节 306 | // Apply applies a tar stream of an OCI style diff tar. 307 | // See https://github.com/opencontainers/image-spec/blob/master/layer.md#applying-changesets 308 | func Apply(ctx context.Context, root string, r io.Reader, opts ...ApplyOpt) (int64, error) { 309 | root = filepath.Clean(root) 310 | 311 | var options ApplyOptions 312 | for _, opt := range opts { 313 | if err := opt(&options); err != nil { 314 | return 0, errors.Wrap(err, "failed to apply option") 315 | } 316 | } 317 | if options.Filter == nil { 318 | options.Filter = all 319 | } 320 | 321 | return apply(ctx, root, tar.NewReader(r), options) 322 | } 323 | 324 | 这一节最后有关snapshotter的细节比较多,特别是step1-step3,会在介绍snapshotter和具体storage driver的时候看一下其工作机制和step1-3的细节,并总结回顾一下这部分内容。 325 | 326 | [1]: http://static.zybuluo.com/myecho/5ge7ybzdsb69920sxvkd958k/image.png 327 | [2]: http://static.zybuluo.com/myecho/75bv8w7hnh82usvhwe6kok02/image.png 328 | -------------------------------------------------------------------------------- /chapter4/section4.3.md: -------------------------------------------------------------------------------- 1 | # snapshotter 2 | 3 | ### 为什么要有snapshotter? 4 | https://blog.mobyproject.org/where-are-containerds-graph-drivers-145fc9b7255 5 | 6 | > These differ from the concept of the graphdriver in that the 7 | > Snapshotter has no knowledge of images or containers. Users simply 8 | > prepare and commit directories. We also avoid the integration between 9 | > graph drivers and the tar format used to represent the changesets. 10 | > 11 | > The Snapshotter will only provide mount-oriented snapshot access with 12 | > minimal metadata. Serialization, hashing, unpacking, packing and 13 | > mounting are not included in this design, opting for common 14 | > implementations between graphdrivers, rather than specialized ones. 
15 | > This is less of a problem for performance since direct access to 16 | > changesets is provided in the interface. 17 | > 18 | > The Snapshotter provides an API for allocating, snapshotting and 19 | > mounting abstract, layer-based filesystems. The model works by 20 | > building up sets of directories with parent-child relationships, known 21 | > as Snapshots. 22 | 23 | ### 整体架构和Model 24 | 25 | ![image.png-119.7kB][1] 26 | ![image.png-104.8kB][2] 27 | 28 | > Snapshots are best understood by their lifecycle. Active snapshots are 29 | > always created with Prepare or View from a Committed snapshot 30 | > (including the empty snapshot). Committed snapshots are always created 31 | > with Commit from an Active snapshot. Active snapshots never become 32 | > committed snapshots and vice versa. All snapshots may be removed. 33 | > 34 | > After mounting an Active snapshot, changes can be made to the 35 | > snapshot. The act of committing creates a Committed snapshot. The 36 | > committed snapshot will inherit the parent of the active snapshot. The 37 | > committed snapshot can then be used as a parent. Active snapshots can 38 | > never be used as a parent. 39 | > 40 | > In this diagram, you can see that the active snapshot a is created by 41 | > calling Prepare with the committed snapshot P0. After modification, a 42 | > becomes a' and a committed snapshot P1 is created by calling Commit. 43 | > a' can be further modified as a'' and a second committed snapshot can 44 | > be created as P2 by calling Commit again. Note here that P2's parent 45 | > is P0 and not P1. 46 | 47 | 要搞清楚这个p2的parent为什么是p0,而不是p1? 48 | --- 本质上a''还是从a做改动来的,而p2是会继承a''的parent的,也就是a的来源,p0 snapshot。 49 | 50 | > Types of container filesystems In the container world we use two types 51 | > of filesystems: overlays and snapshotting filesystems. AUFS and 52 | > OverlayFS are overlay filesystems which have multiple directories with 53 | > file diffs for each “layer” in an image. Snapshotting filesystems 54 | > include devicemapper, btrfs, and ZFS which handle file diffs at the 55 | > block level. Overlays usually work on common filesystem types such as 56 | > EXT4 and XFS whereas snapshotting filesystems only run on volumes 57 | > formatted for them. 58 | 59 | 快照类型的文件系统需要底层块设备提前format成对应的特定格式,不能直接运行在common filsystem上比如ext4等 60 | 61 | 目前支持的存储引擎有哪些? 62 | 63 | * native 对应着vfs?对每个image layer直接无脑拷贝,不存在COW的语义 64 | * lcow http://dockone.io/article/3299, https://github.com/moby/moby/pull/34859 65 | * overlay 66 | * aufs 67 | * btrfs 68 | * zfs 69 | 70 | ### 基本流程 71 | 72 | #### Importing a Layer 73 | To import a layer, we simply have the Snapshotter provide a list of mounts to be applied such that our destination will capture a changeset. We start out by getting a path to the layer tar file and creating a temp location to unpack it to: 74 | 75 | layerPath, tmpDir := getLayerPath(), mkTmpDir() // just a path to layer tar file. 76 | 77 | We start by using a Snapshotter to Prepare a new snapshot transaction, using a key and descending from the empty parent "": 78 | 79 | mounts, err := snapshotter.Prepare(key, "") 80 | if err != nil { ... } 81 | 82 | We get back a list of mounts from Snapshotter.Prepare, with the key identifying the active snapshot. Mount this to the temporary location with the following: 83 | 84 | if err := mount.All(mounts, tmpDir); err != nil { ... } 85 | 86 | Once the mounts are performed, our temporary location is ready to capture a diff. In practice, this works similar to a filesystem transaction. 
The next step is to unpack the layer. We have a special function unpackLayer that applies the contents of the layer to target location and calculates the DiffID of the unpacked layer (this is a requirement for docker implementation): 87 | 88 | layer, err := os.Open(layerPath) 89 | if err != nil { ... } 90 | digest, err := unpackLayer(tmpLocation, layer) // unpack into layer location 91 | if err != nil { ... } 92 | 93 | When the above completes, we should have a filesystem the represents the contents of the layer. Careful implementations should verify that digest matches the expected DiffID. When completed, we unmount the mounts: 94 | 95 | unmount(mounts) // optional, for now 96 | 97 | Now that we've verified and unpacked our layer, we commit the active snapshot to a name. For this example, we are just going to use the layer digest, but in practice, this will probably be the ChainID: 98 | 99 | if err := snapshotter.Commit(digest.String(), key); err != nil { ... } 100 | 101 | Now, we have a layer in the Snapshotter that can be accessed with the digest provided during commit. Once you have committed the snapshot, the active snapshot can be removed with the following: 102 | 103 | snapshotter.Remove(key) 104 | 105 | 从上边的描述我们可以看出,上一节unpack中的提到的step1-step3中只有step1 prepare以及step3 commit属于snapshotter的工作范畴,而step2的apply实际上调用的diff服务实现的接口。 106 | 107 | #### Importing the Next Layer 108 | Making a layer depend on the above is identical to the process described above except that the parent is provided as parent when calling Snapshotter.Prepare, assuming a clean tmpLocation: 109 | 110 | mounts, err := snapshotter.Prepare(tmpLocation, parentDigest) 111 | 112 | We then mount, apply and commit, as we did above. The new snapshot will be based on the content of the previous one. 113 | 114 | #### Running a Container 115 | To run a container, we simply provide Snapshotter.Prepare the committed image snapshot as the parent. After mounting, the prepared path can be used directly as the container's filesystem: 116 | 117 | mounts, err := snapshotter.Prepare(containerKey, imageRootFSChainID) 118 | 119 | The returned mounts can then be passed directly to the container runtime. If one would like to create a new image from the filesystem, Snapshotter.Commit is called: 120 | 121 | if err := snapshotter.Commit(newImageSnapshot, containerKey); err != nil { ... } 122 | 123 | Alternatively, for most container runs, Snapshotter.Remove will be called to signal the Snapshotter to abandon the changes. 124 | 125 | #### ctr snapshot命令 126 | 127 | COMMANDS: 128 | commit commit an active snapshot into the provided name 129 | info get info about a snapshot 130 | list, ls list snapshots 131 | mounts, m, mount mount gets mount commands for the snapshots 132 | prepare prepare a snapshot from a committed snapshot 133 | remove, rm remove snapshots 134 | label add labels to content 135 | tree display tree view of snapshot branches 136 | unpack unpack applies layers from a manifest to a snapshot 137 | usage usage snapshots 138 | view create a read-only snapshot from a committed snapshot 139 | 140 | 涵盖了上文中介绍过的operation. 
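把上面 Importing a Layer 的几个片段串起来,可以得到一个导入单层 layer 的简化 Go 示意(unpackLayer 与上文描述一致、此处不展开,错误处理与 DiffID 校验从略,变量名均为假设):

    import (
        "context"
        "io"

        "github.com/containerd/containerd/mount"
        "github.com/containerd/containerd/snapshots"
    )

    func importLayer(ctx context.Context, sn snapshots.Snapshotter,
        layer io.Reader, key, parent, tmpDir string) (string, error) {
        // 1. Prepare:基于 parent 生成 active snapshot,拿到一组 mount
        mounts, err := sn.Prepare(ctx, key, parent)
        if err != nil {
            return "", err
        }
        // 2. 挂载到临时目录,把 layer 内容解包进去(unpackLayer 会顺便算出 DiffID)
        if err := mount.All(mounts, tmpDir); err != nil {
            return "", err
        }
        dgst, err := unpackLayer(tmpDir, layer)
        mount.Unmount(tmpDir, 0) // 示意用,忽略 unmount 错误
        if err != nil {
            return "", err
        }
        // 3. Commit:把 active snapshot 固化为 committed snapshot
        //    这里直接用 digest 命名,实际实现中一般使用 ChainID
        if err := sn.Commit(ctx, dgst.String(), key); err != nil {
            return "", err
        }
        return dgst.String(), nil
    }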
141 | 142 | ### 参考 143 | https://github.com/containerd/containerd/blob/master/design/snapshots.md 144 | https://integratedcode.us/2016/08/30/storage-drivers-in-docker-a-deep-dive/ 145 | https://www.cnblogs.com/breezey/p/9589288.html 146 | 147 | [1]: http://static.zybuluo.com/myecho/yo7ugmvzsg9fauj7b5nqv7x6/image.png 148 | [2]: http://static.zybuluo.com/myecho/qkfamwyg8x3f3vb2ylctkyqg/image.png 149 | 150 | -------------------------------------------------------------------------------- /chapter4/section4.4.md: -------------------------------------------------------------------------------- 1 | # pouch commit实现 2 | 3 | 我们借助pouch commit的实现来看一下,如何使用containerd的diff和snapshot服务。 4 | 首先要明白containerd是不支持build/commit功能的,那么pouch是如何实现的commit命令呢? 5 | 6 | ### docker commit 7 | 要明白pouch commit如何实现,我们首先看一下docker commit是如何实现的,在功能上基本相关,只不过前者依赖containerd提供的grpc服务。 8 | 9 | > 深入学习docker commit 的原理前,我不妨先来看看一下 docker help 中关于 commit 命令的阐述: commit 10 | > Create a new image from a container's changes 结合上图与命令docker commit 11 | > 的描述,我们可以发现有三个关键字Image、Container 与Changes 。如何理解这三个关键字,我们可以从以下三个步骤入手: 12 | 13 | > 1. Docker Daemon 会通过一个 Docker 镜像运行一个 Docker 容器,Docker 通过层级文件系统为 Docker 容器提供文件系统视角,最上层的是可读可写层(Read-Write Layer)。 14 | > 15 | > 2. Docker 容器初始的可读可写层内容均为空,Docker 容器对文件系统的内容更新将全部更新于可读可写层(Read-Write Layer)。 16 | > 17 | > 3. 实现 docker commit 操作时,Docker 仅仅是将可读可写层(Read-Write Layer)中的更新内容,打包为一个全新的镜像。 18 | 19 | 简言之,所谓的commit功能就是将容器最上层的R/W层重新打包成一个新的镜像,至于说如何构造镜像要依赖不同的实现。 20 | 21 | 22 | ### pouch commit实现 23 | 有了上边的介绍,我们对commit的功能有了一个基本的了解,如果让我们在pouch里边去实现commit的话,无非依赖于以下几个操作,Diff操作来提取出R/W读写层,Content服务写入tar包,然后unpack到snapshot中去,我们实际看一下pouch commit的实现看是不是这样做的。 24 | 25 | path: alibaba/pouch/ctrd/image_commit.go 26 | 27 | func (c *Client) Commit(ctx context.Context, config *CommitConfig) (_ digest.Digest, err0 error) { 28 | // get a containerd client 29 | wrapperCli, err := c.Get(ctx) 30 | if err != nil { 31 | return "", fmt.Errorf("failed to get a containerd grpc client: %v", err) 32 | } 33 | client := wrapperCli.client 34 | 35 | var ( 36 | sn = client.SnapshotService(defaultSnapshotterName) 37 | cs = client.ContentStore() 38 | differ = client.DiffService() 39 | ) 40 | 41 | // export new layer 42 | snapshot, err := c.GetSnapshot(ctx, config.ContainerID) 43 | if err != nil { 44 | return "", fmt.Errorf("failed to get snapshot: %s", err) 45 | } 46 | // 调用diff服务,返回描述这一层的descriptor 47 | layer, diffIDStr, err := exportLayer(ctx, snapshot.Name, sn, cs, differ) 48 | if err != nil { 49 | return "", err 50 | } 51 | 52 | // create child image 53 | diffIDDigest, err := digest.Parse(diffIDStr) 54 | if err != nil { 55 | return "", err 56 | } 57 | 58 | //结合parent image构建一个符合oci spec的img描述体 59 | childImg, err := newChildImage(ctx, config, diffIDDigest) 60 | if err != nil { 61 | return "", err 62 | } 63 | 64 | // create new snapshot for new layer 65 | //产生新镜像chainId作为镜像的snapshotKey 66 | snapshotKey := identity.ChainID(childImg.RootFS.DiffIDs).String() 67 | //还是按照pull镜像时候的3步走 先prepare -> apply -> commit 68 | if err = newSnapshot(ctx, config.Image, sn, differ, layer, snapshotKey, diffIDStr); err != nil { 69 | return "", err 70 | } 71 | defer func() { 72 | if err0 != nil { 73 | logrus.Warnf("remove snapshot %s cause commit image failed", snapshotKey) 74 | client.SnapshotService(defaultSnapshotterName).Remove(ctx, snapshotKey) 75 | } 76 | }() 77 | 78 | imgJSON, err := json.Marshal(childImg) 79 | if err != nil { 80 | return "", err 81 | } 82 | 83 | // 以下分别构建新镜像config descriptor的layer descriptor 84 | 85 | // new config descriptor 86 | configDesc := 
ocispec.Descriptor{ 87 | MediaType: configType, 88 | Digest: digest.FromBytes(imgJSON), 89 | Size: int64(len(imgJSON)), 90 | } 91 | 92 | // get parent image layer descriptor 93 | pmfst, err := images.Manifest(ctx, cs, config.CImage.Target(), platforms.Default()) 94 | if err != nil { 95 | return "", err 96 | } 97 | 98 | // new layer descriptor 99 | layers := append(pmfst.Layers, layer) 100 | labels := map[string]string{ 101 | "containerd.io/gc.ref.content.0": configDesc.Digest.String(), 102 | } 103 | for i, l := range layers { 104 | labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i+1)] = l.Digest.String() 105 | } 106 | 107 | // new manifest descriptor 108 | mfst := ocispec.Manifest{ 109 | Versioned: specs.Versioned{ 110 | SchemaVersion: 2, 111 | }, 112 | Config: configDesc, 113 | Layers: layers, 114 | } 115 | 116 | mfstJSON, err := json.MarshalIndent(mfst, "", " ") 117 | if err != nil { 118 | return "", errors.Wrap(err, "failed to marshal manifest") 119 | } 120 | 121 | mfstDigest := digest.FromBytes(mfstJSON) 122 | mfstDesc := ocispec.Descriptor{ 123 | Digest: mfstDigest, 124 | Size: int64(len(mfstJSON)), 125 | } 126 | 127 | desc := ocispec.Descriptor{ 128 | MediaType: manifestType, 129 | Digest: mfstDigest, 130 | Size: int64(len(mfstJSON)), 131 | } 132 | 133 | // image create 134 | img := images.Image{ 135 | Name: config.Reference, 136 | Target: desc, 137 | CreatedAt: time.Now(), 138 | } 139 | 140 | // register containerd image metadata. 141 | // 向containerd的images注册镜像的元数据 142 | if _, err := client.ImageService().Update(ctx, img); err != nil { 143 | if !errdefs.IsNotFound(err) { 144 | return "", fmt.Errorf("failed to cover exist image %s", err) 145 | } 146 | if _, err := client.ImageService().Create(ctx, img); err != nil { 147 | return "", fmt.Errorf("failed to create new image %s", err) 148 | } 149 | } 150 | 151 | 152 | // write manifest content 153 | //镜像为单位的,在unpack那个环节有介绍 154 | if err := content.WriteBlob(ctx, cs, mfstDigest.String(), bytes.NewReader(mfstJSON), mfstDesc.Size, mfstDesc.Digest, content.WithLabels(labels)); err != nil { 155 | return "", errors.Wrapf(err, "error writing manifest blob %s", mfstDigest) 156 | } 157 | 158 | // write config content 159 | // 实际上就是前边那个oci spec img 160 | if err := content.WriteBlob(ctx, cs, configDesc.Digest.String(), bytes.NewReader(imgJSON), configDesc.Size, configDesc.Digest); err != nil { 161 | return "", errors.Wrap(err, "error writing config blob") 162 | } 163 | 164 | // pouch record config descriptor digest as image id. 
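    // 这与 4.1 节提到的约定一致:image id 本质上就是 image config JSON 的 digest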
165 |         return configDesc.Digest, nil
166 |     }
167 | 
168 | ### 总结
169 | Looking at the commit implementation, the division of responsibility between the diff service and the snapshot service becomes clear: the diff service works on tar streams (packing up the R/W layer, applying diffs), while the snapshot service plays more of a bookkeeping role and connects to the other services through mount points.
170 | 
171 | ### 参考
172 | https://github.com/alibaba/pouch/pull/2125
173 | http://guide.daocloud.io/dcs/docker-commit-9153991.html
174 | 
--------------------------------------------------------------------------------
/chapter5/README.md:
--------------------------------------------------------------------------------
1 | # 存储
2 | 
3 | This chapter takes overlayfs and btrfs as its typical examples to show how union filesystems and snapshot-based filesystems implement the snapshotter.
4 | 
--------------------------------------------------------------------------------
/chapter5/section5.1.md:
--------------------------------------------------------------------------------
1 | # native
2 | 
3 | The native snapshotter does its work on top of an ordinary Linux filesystem such as ext4. It is rarely used in real deployments, but because its details are simple it is a good reference for implementing your own snapshotter and a good way to understand the main snapshotter flow.
4 | 
5 | Our study of the snapshotter focuses on the Prepare and Commit methods.
6 | 
7 |     // the key here is expected to be a chainID
8 |     func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
9 |         return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
10 |     }
11 | 
12 |     func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) {
13 |         var (
14 |             err error
15 |             path, td string
16 |         )
17 | 
18 |         if kind == snapshots.KindActive || parent == "" {
19 |             // for an active snapshot, or one without a parent, create a directory to hold this layer's content (the equivalent of moby's R/W layer)
20 |             td, err = ioutil.TempDir(filepath.Join(o.root, "snapshots"), "new-")
21 |             if err != nil {
22 |                 return nil, errors.Wrap(err, "failed to create temp dir")
23 |             }
24 |             if err := os.Chmod(td, 0755); err != nil {
25 |                 return nil, errors.Wrapf(err, "failed to chmod %s to 0755", td)
26 |             }
27 |             defer func() {
28 |                 if err != nil {
29 |                     if td != "" {
30 |                         if err1 := os.RemoveAll(td); err1 != nil {
31 |                             err = errors.Wrapf(err, "remove failed: %v", err1)
32 |                         }
33 |                     }
34 |                     if path != "" {
35 |                         if err1 := os.RemoveAll(path); err1 != nil {
36 |                             err = errors.Wrapf(err, "failed to remove path: %v", err1)
37 |                         }
38 |                     }
39 |                 }
40 |             }()
41 |         }
42 | 
43 |         // open a transaction on the MetaStore
44 |         ctx, t, err := o.ms.TransactionContext(ctx, true)
45 |         if err != nil {
46 |             return nil, err
47 |         }
48 |         // record the snapshot's metadata at the storage layer
49 |         s, err := storage.CreateSnapshot(ctx, kind, key, parent, opts...)
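        // the returned storage.Snapshot carries the internal ID and the parent ID chain;
        // the directory handling below names the on-disk snapshot dir after s.ID and
        // copies content from the first (direct) parent when one exists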
50 | if err != nil { 51 | if rerr := t.Rollback(); rerr != nil { 52 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 53 | } 54 | return nil, errors.Wrap(err, "failed to create snapshot") 55 | } 56 | 57 | // 如果是需要创建新的目录的话 58 | if td != "" { 59 | // layer have parent 60 | if len(s.ParentIDs) > 0 { 61 | // 直接拿直系parent的即可,不支持多层的parent没有意义 62 | parent := o.getSnapshotDir(s.ParentIDs[0]) 63 | // 因为native filesystem没有提供COW的能力,因此必须全量拷贝parent的目录 64 | if err := fs.CopyDir(td, parent); err != nil { 65 | return nil, errors.Wrap(err, "copying of parent failed") 66 | } 67 | } 68 | // s.ID应该每个snapshot唯一,不能冲突 69 | path = o.getSnapshotDir(s.ID) 70 | // 重命名临时目录,因此path对于snapshot应该是唯一的,和s.ID一一对应 71 | if err := os.Rename(td, path); err != nil { 72 | if rerr := t.Rollback(); rerr != nil { 73 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 74 | } 75 | return nil, errors.Wrap(err, "failed to rename") 76 | } 77 | td = "" 78 | } 79 | 80 | if err := t.Commit(); err != nil { 81 | return nil, errors.Wrap(err, "commit failed") 82 | } 83 | 84 | return o.mounts(s), nil 85 | } 86 | 87 | // 其实就是在bolt的元数据的存储中登记一下 88 | func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount { 89 | var ( 90 | roFlag string 91 | source string 92 | ) 93 | 94 | // 区分是否可读写 95 | if s.Kind == snapshots.KindView { 96 | roFlag = "ro" 97 | } else { 98 | roFlag = "rw" 99 | } 100 | 101 | if len(s.ParentIDs) == 0 || s.Kind == snapshots.KindActive { 102 | source = o.getSnapshotDir(s.ID) 103 | } else { 104 | // 只读的话直接拿parent的即可 105 | source = o.getSnapshotDir(s.ParentIDs[0]) 106 | } 107 | 108 | return []mount.Mount{ 109 | { 110 | Source: source, 111 | // 联合挂载,可在本机上实验一下 112 | Type: "bind", 113 | Options: []string{ 114 | roFlag, 115 | // rbind表示会把原来目录下的挂载点也会一起挂载过去 116 | "rbind", 117 | }, 118 | }, 119 | } 120 | } 121 | 122 | // 在bolt元数据存储中标记一下,表示snapshot为commited状态 123 | func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error { 124 | ctx, t, err := o.ms.TransactionContext(ctx, true) 125 | if err != nil { 126 | return err 127 | } 128 | 129 | id, _, _, err := storage.GetInfo(ctx, key) 130 | if err != nil { 131 | return err 132 | } 133 | 134 | usage, err := fs.DiskUsage(ctx, o.getSnapshotDir(id)) 135 | if err != nil { 136 | return err 137 | } 138 | 139 | if _, err := storage.CommitActive(ctx, key, name, snapshots.Usage(usage), opts...); err != nil { 140 | if rerr := t.Rollback(); rerr != nil { 141 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 142 | } 143 | return errors.Wrap(err, "failed to commit snapshot") 144 | } 145 | return t.Commit() 146 | } 147 | -------------------------------------------------------------------------------- /chapter5/section5.2.md: -------------------------------------------------------------------------------- 1 | # overlayfs 2 | 3 | 4 | > OverlayFS is a modern union filesystem that is similar to AUFS, but 5 | > faster and with a simpler implementation. Docker provides two storage 6 | > drivers for OverlayFS: the original overlay, and the newer and more 7 | > stable overlay2. 8 | 9 | ### 预备知识 10 | https://docs.docker.com/storage/storagedriver/overlayfs-driver/ 11 | 12 | overlay vs overlay2 13 | > If you are still using the overlay driver rather than overlay2, see 14 | > How the overlay driver works instead. 15 | > 16 | > OverlayFS layers two directories on a single Linux host and presents 17 | > them as a single directory. These directories are called layers and 18 | > the unification process is referred to as a union mount. 
OverlayFS 19 | > refers to the lower directory as lowerdir and the upper directory a 20 | > upperdir. The unified view is exposed through its own directory called 21 | > merged. 22 | > 23 | > While the overlay driver only works with a single lower OverlayFS 24 | > layer and hence requires hard links for implementation of 25 | > multi-layered images, the overlay2 driver natively supports up to 128 26 | > lower OverlayFS layers. This capability provides better performance 27 | > for layer-related Docker commands such as docker build and docker 28 | > commit, and consumes fewer inodes on the backing filesystem. 29 | 30 | overlay1存在的问题: 31 | 32 | > While the overlay driver only works with a single lower OverlayFS 33 | > layer and hence requires hard links for implementation of 34 | > multi-layered images, the overlay2 driver natively supports up to 128 35 | > lower OverlayFS layers. This capability provides better performance 36 | > for layer-related Docker commands such as docker build and docker 37 | > commit, and consumes fewer inodes on the backing filesystem. 38 | 39 | 如果还是在同一个文件系统的时候,比如pull镜像的时候是可以使用hard-link的方式来引用低层的镜像层的数据的。 40 | 41 | To create a container, the overlay driver combines the directory representing the image’s top layer plus a new directory for the container. The image’s top layer is the lowerdir in the overlay and is read-only. The new directory for the container is the upperdir and is writable. 42 | 43 | moby overlay的代码:moby/moby/daemon/graphdriver/overlay 44 | * `Create()`方法也就是产生rootfs的过程中都是`copy.DirCopy(parentUpperDir, upperDir, copy.Content, true)`也就是拷贝的内容(将parentUpperDir的内容拷贝到upperDir,因为只支持一层lowerDir,同时不能将parentUpperDir hard-link到upperDir目录,因为upper层是可读写的,hard-link也会破坏原有的parent) 45 | * `ApplyDiff()`也就是构建镜像过程中调用的函数都是使用的hard-link的方式`copy.DirCopy(parentRootDir, tmpRootDir, copy.Hardlink, true)` 46 | 47 | docker commit过程需要`Create() -> Diff() -> ApplyDiff()`的过程,因为`Create()`会拷贝文件内容,因此会消耗多余的inode。 48 | 49 | ### 具体实现 50 | 51 | func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { 52 | return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts) 53 | } 54 | 55 | func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) { 56 | ctx, t, err := o.ms.TransactionContext(ctx, true) 57 | if err != nil { 58 | return nil, err 59 | } 60 | 61 | var td, path string 62 | defer func() { 63 | if err != nil { 64 | if td != "" { 65 | if err1 := os.RemoveAll(td); err1 != nil { 66 | log.G(ctx).WithError(err1).Warn("failed to cleanup temp snapshot directory") 67 | } 68 | } 69 | if path != "" { 70 | if err1 := os.RemoveAll(path); err1 != nil { 71 | log.G(ctx).WithError(err1).WithField("path", path).Error("failed to reclaim snapshot directory, directory may need removal") 72 | err = errors.Wrapf(err, "failed to remove path: %v", err1) 73 | } 74 | } 75 | } 76 | }() 77 | 78 | snapshotDir := filepath.Join(o.root, "snapshots") 79 | td, err = o.prepareDirectory(ctx, snapshotDir, kind) 80 | if err != nil { 81 | if rerr := t.Rollback(); rerr != nil { 82 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 83 | } 84 | return nil, errors.Wrap(err, "failed to create prepare snapshot dir") 85 | } 86 | rollback := true 87 | defer func() { 88 | if rollback { 89 | if rerr := t.Rollback(); rerr != nil { 90 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 91 | } 92 | } 93 | }() 94 | 95 | s, err := storage.CreateSnapshot(ctx, 
kind, key, parent, opts...) 96 | if err != nil { 97 | return nil, errors.Wrap(err, "failed to create snapshot") 98 | } 99 | 100 | if len(s.ParentIDs) > 0 { 101 | st, err := os.Stat(o.upperPath(s.ParentIDs[0])) 102 | if err != nil { 103 | return nil, errors.Wrap(err, "failed to stat parent") 104 | } 105 | 106 | stat := st.Sys().(*syscall.Stat_t) 107 | 108 | // 设置目录权限和parent的相同 109 | if err := os.Lchown(filepath.Join(td, "fs"), int(stat.Uid), int(stat.Gid)); err != nil { 110 | if rerr := t.Rollback(); rerr != nil { 111 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 112 | } 113 | return nil, errors.Wrap(err, "failed to chown") 114 | } 115 | } 116 | 117 | path = filepath.Join(snapshotDir, s.ID) 118 | if err = os.Rename(td, path); err != nil { 119 | return nil, errors.Wrap(err, "failed to rename") 120 | } 121 | td = "" 122 | 123 | // 这里不太理解为什么rollback=false? 124 | rollback = false 125 | if err = t.Commit(); err != nil { 126 | return nil, errors.Wrap(err, "commit failed") 127 | } 128 | 129 | // merge目录不由snapshotter决定,挂载在哪,哪就是merge层 130 | return o.mounts(s), nil 131 | } 132 | 133 | func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount { 134 | if len(s.ParentIDs) == 0 { 135 | // if we only have one layer/no parents then just return a bind mount as overlay 136 | // will not work 137 | roFlag := "rw" 138 | if s.Kind == snapshots.KindView { 139 | roFlag = "ro" 140 | } 141 | 142 | return []mount.Mount{ 143 | { 144 | Source: o.upperPath(s.ID), 145 | Type: "bind", 146 | Options: []string{ 147 | roFlag, 148 | "rbind", 149 | }, 150 | }, 151 | } 152 | } 153 | var options []string 154 | 155 | if s.Kind == snapshots.KindActive { 156 | options = append(options, 157 | // filepath.Join(o.root, "snapshots", id, "work") 158 | // 文件系统挂载后用于存放临时和间接文件的工作基目录 159 | fmt.Sprintf("workdir=%s", o.workPath(s.ID)), 160 | // upper目录,也就是文件系统存储的主目录,可以认为是container的RW层 161 | // filepath.Join(o.root, "snapshots", id, "fs") 162 | fmt.Sprintf("upperdir=%s", o.upperPath(s.ID)), 163 | ) 164 | } else if len(s.ParentIDs) == 1 { 165 | // 只有一层且只是返回可读层的时候上边注释也有说明,直接返回父亲的bind mount即可 166 | return []mount.Mount{ 167 | { 168 | Source: o.upperPath(s.ParentIDs[0]), 169 | Type: "bind", 170 | Options: []string{ 171 | "ro", 172 | "rbind", 173 | }, 174 | }, 175 | } 176 | } 177 | 178 | // 不论是返回读写层都需要把lowerdir给放到options中去 179 | parentPaths := make([]string, len(s.ParentIDs)) 180 | for i := range s.ParentIDs { 181 | parentPaths[i] = o.upperPath(s.ParentIDs[i]) 182 | } 183 | 184 | options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":"))) 185 | // 具体如何处理由mount包处理,参考containerd/containerd/sys/mount_linux.go 186 | return []mount.Mount{ 187 | { 188 | Type: "overlay", 189 | Source: "overlay", 190 | Options: options, 191 | }, 192 | } 193 | 194 | } 195 | 196 | // 和native的一样的,没什么好说的 197 | func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error { 198 | ctx, t, err := o.ms.TransactionContext(ctx, true) 199 | if err != nil { 200 | return err 201 | } 202 | 203 | defer func() { 204 | if err != nil { 205 | if rerr := t.Rollback(); rerr != nil { 206 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 207 | } 208 | } 209 | }() 210 | 211 | // grab the existing id 212 | id, _, _, err := storage.GetInfo(ctx, key) 213 | if err != nil { 214 | return err 215 | } 216 | 217 | usage, err := fs.DiskUsage(ctx, o.upperPath(id)) 218 | if err != nil { 219 | return err 220 | } 221 | 222 | if _, err = storage.CommitActive(ctx, key, name, snapshots.Usage(usage), opts...); 
err != nil { 223 | return errors.Wrap(err, "failed to commit snapshot") 224 | } 225 | return t.Commit() 226 | } 227 | 228 | 229 | -------------------------------------------------------------------------------- /chapter5/section5.3.md: -------------------------------------------------------------------------------- 1 | # btrfs 2 | 3 | 4 | ### 预备知识 5 | 对于Docker 社区版本来说,不同linux发行版的选择如下: 6 | ![不同系统支持][1] 7 | 对于不同的文件系统,推荐如下: 8 | ![文件系统所需要支持][2] 9 | 10 | https://docs.docker.com/storage/storagedriver/btrfs-driver/ 11 | 12 | > Btrfs is a next generation copy-on-write filesystem that supports many 13 | > advanced storage technologies that make it a good fit for Docker. 14 | > Btrfs is included in the mainline Linux kernel. 15 | > btrfs requires a dedicated block storage device such as a physical 16 | > disk. This block device must be formatted for Btrfs and mounted into 17 | > /var/lib/docker/. 18 | > 19 | > One of the benefits of Btrfs is the ease of managing Btrfs filesystems 20 | > without the need to unmount the filesystem or restart Docker. 21 | > 22 | > When space gets low, Btrfs automatically expands the volume in chunks 23 | > of roughly 1 GB. 24 | > 25 | > To add a block device to a Btrfs volume, use the btrfs device add and 26 | > btrfs filesystem balance commands. 27 | > 28 | > $ sudo btrfs device add /dev/svdh /var/lib/docker 29 | > $ sudo btrfs filesystem balance /var/lib/docker 30 | > 31 | > With Btrfs, writing and updating lots of small files can result in 32 | > slow performance. 33 | 34 | 适合大I/O的场景,且由于其日志的实现,顺序写的性能也不会很高。 35 | 36 | ### 具体实现 37 | 38 | func (b *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { 39 | return b.makeSnapshot(ctx, snapshots.KindActive, key, parent, opts) 40 | } 41 | 42 | func (b *snapshotter) makeSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) { 43 | ctx, t, err := b.ms.TransactionContext(ctx, true) 44 | if err != nil { 45 | return nil, err 46 | } 47 | defer func() { 48 | if err != nil && t != nil { 49 | if rerr := t.Rollback(); rerr != nil { 50 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 51 | } 52 | } 53 | }() 54 | 55 | s, err := storage.CreateSnapshot(ctx, kind, key, parent, opts...) 56 | if err != nil { 57 | return nil, err 58 | } 59 | // 形如root/active/snapshotId 60 | // 这里为什么要加kind?不能直接全部放到snapshots里边去? 
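        // (presumably the kind is in the path to keep in-progress subvolumes apart from
        // committed ones: Commit() below re-snapshots root/active/<id> into
        // root/snapshots/<id> as read-only and then deletes the active copy)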
61 | target := filepath.Join(b.root, strings.ToLower(s.Kind.String()), s.ID) 62 | 63 | if len(s.ParentIDs) == 0 { 64 | // create new subvolume 65 | // btrfs subvolume create /dir 66 | // 不做快照直接创建subvolume 67 | if err = btrfs.SubvolCreate(target); err != nil { 68 | return nil, err 69 | } 70 | } else { 71 | // root/snapshots/parent (单亲关系,不存在多个父节点的情况) 72 | /* 73 | var ( 74 | active = filepath.Join(root, "active") 75 | view = filepath.Join(root, "view") 76 | snapshots = filepath.Join(root, "snapshots") 77 | ) 78 | */ 79 | parentp := filepath.Join(b.root, "snapshots", s.ParentIDs[0]) 80 | 81 | var readonly bool 82 | if kind == snapshots.KindView { 83 | readonly = true 84 | } 85 | 86 | // btrfs subvolume snapshot /parent /subvol 87 | // 通过snapshot进行创建是因为其底层共享storage pool的数据,符合COW的语义 88 | if err = btrfs.SubvolSnapshot(target, parentp, readonly); err != nil { 89 | return nil, err 90 | } 91 | } 92 | err = t.Commit() 93 | t = nil 94 | if err != nil { 95 | if derr := btrfs.SubvolDelete(target); derr != nil { 96 | log.G(ctx).WithError(derr).WithField("subvolume", target).Error("failed to delete subvolume") 97 | } 98 | return nil, err 99 | } 100 | 101 | return b.mounts(target, s) 102 | } 103 | 104 | func (b *snapshotter) mounts(dir string, s storage.Snapshot) ([]mount.Mount, error) { 105 | var options []string 106 | 107 | // get the subvolume id back out for the mount 108 | sid, err := btrfs.SubvolID(dir) 109 | if err != nil { 110 | return nil, err 111 | } 112 | 113 | options = append(options, fmt.Sprintf("subvolid=%d", sid)) 114 | 115 | if s.Kind != snapshots.KindActive { 116 | options = append(options, "ro") 117 | } 118 | 119 | return []mount.Mount{ 120 | { 121 | Type: "btrfs", 122 | Source: b.device, 123 | // NOTE(stevvooe): While it would be nice to use to uuids for 124 | // mounts, they don't work reliably if the uuids are missing. 125 | Options: options, 126 | }, 127 | }, nil 128 | } 129 | 130 | // 这里和前边简介的commit操作有所不同,不是只做了元数据的处理 131 | func (b *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) (err error) { 132 | usage, err := b.usage(ctx, key) 133 | if err != nil { 134 | return errors.Wrap(err, "failed to compute usage") 135 | } 136 | 137 | ctx, t, err := b.ms.TransactionContext(ctx, true) 138 | if err != nil { 139 | return err 140 | } 141 | defer func() { 142 | if err != nil && t != nil { 143 | if rerr := t.Rollback(); rerr != nil { 144 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 145 | } 146 | } 147 | }() 148 | 149 | id, err := storage.CommitActive(ctx, key, name, usage, opts...) 
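        // CommitActive marks the metadata record as committed under the given name and
        // returns the internal snapshot ID, which is reused below as the subvolume directory name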
// TODO(stevvooe): Resolve a usage value for btrfs
150 |         if err != nil {
151 |             return errors.Wrap(err, "failed to commit")
152 |         }
153 | 
154 |         source := filepath.Join(b.root, "active", id)
155 |         target := filepath.Join(b.root, "snapshots", id)
156 | 
157 |         // the active subvolume must be turned into a read-only snapshot under root/snapshots, because Prepare resolves parents from the snapshots directory
158 |         // at Commit time: active -> snapshot
159 |         // at Prepare time: parent snapshot -> new (active) snapshot, which keeps the parent snapshot read-only
160 |         if err := btrfs.SubvolSnapshot(target, source, true); err != nil {
161 |             return err
162 |         }
163 | 
164 |         err = t.Commit()
165 |         t = nil
166 |         if err != nil {
167 |             if derr := btrfs.SubvolDelete(target); derr != nil {
168 |                 log.G(ctx).WithError(derr).WithField("subvolume", target).Error("failed to delete subvolume")
169 |             }
170 |             return err
171 |         }
172 | 
173 |         if derr := btrfs.SubvolDelete(source); derr != nil {
174 |             // Log as warning, only needed for cleanup, will not cause name collision
175 |             log.G(ctx).WithError(derr).WithField("subvolume", source).Warn("failed to delete subvolume")
176 |         }
177 | 
178 |         return nil
179 |     }
180 | 
181 | [1]: https://raw.githubusercontent.com/zhangchenchen/zhangchenchen.github.io/hexo/images/20180309151325-linux-distribution.jpg
182 | [2]: https://raw.githubusercontent.com/zhangchenchen/zhangchenchen.github.io/hexo/images/20180309151640-file-system.jpg
183 | 
184 | 
--------------------------------------------------------------------------------